From b1b8eed8e685f741916423acde05dbca0a83cdbe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Jul 2024 18:08:31 +0200 Subject: [PATCH 01/52] PDEP-14: Dedicated string data type for pandas 3.0 (#58551) Co-authored-by: Simon Hawkins Co-authored-by: Irv Lustig Co-authored-by: William Ayd Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- web/pandas/pdeps/0014-string-dtype.md | 375 ++++++++++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 web/pandas/pdeps/0014-string-dtype.md diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md new file mode 100644 index 0000000000000..5b74f71216454 --- /dev/null +++ b/web/pandas/pdeps/0014-string-dtype.md @@ -0,0 +1,375 @@ +# PDEP-14: Dedicated string data type for pandas 3.0 + +- Created: May 3, 2024 +- Status: Accepted +- Discussion: https://github.com/pandas-dev/pandas/pull/58551 +- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) +- Revision: 1 + +## Abstract + +This PDEP proposes to introduce a dedicated string dtype that will be used by +default in pandas 3.0: + +* In pandas 3.0, enable a string dtype (`"str"`) by default, using PyArrow if available + or otherwise a string dtype using numpy object-dtype under the hood as fallback. +* The default string dtype will use missing value semantics (using NaN) consistent + with the other default data types. + +This will give users a long-awaited proper string dtype for 3.0, while 1) not +(yet) making PyArrow a _hard_ dependency, but only a dependency used by default, +and 2) leaving room for future improvements (different missing value semantics, +using NumPy 2.0 strings, etc). + +## Background + +Currently, pandas by default stores text data in an `object`-dtype NumPy array. +The current implementation has two primary drawbacks. First, `object` dtype is +not specific to strings: any Python object can be stored in an `object`-dtype +array, not just strings, and seeing `object` as the dtype for a column with +strings is confusing for users. Second: this is not efficient (all string +methods on a Series are eventually calling Python methods on the individual +string objects). + +To solve the first issue, a dedicated extension dtype for string data has +already been +[added in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#dedicated-string-data-type). +This has always been opt-in for now, requiring users to explicitly request the +dtype (with `dtype="string"` or `dtype=pd.StringDtype()`). The array backing +this string dtype was initially almost the same as the default implementation, +i.e. an `object`-dtype NumPy array of Python strings. + +To solve the second issue (performance), pandas contributed to the development +of string kernels in the PyArrow package, and a variant of the string dtype +backed by PyArrow was +[added in pandas 1.3](https://pandas.pydata.org/docs/whatsnew/v1.3.0.html#pyarrow-backed-string-data-type). +This could be specified with the `storage` keyword in the opt-in string dtype +(`pd.StringDtype(storage="pyarrow")`). + +Since its introduction, the `StringDtype` has always been opt-in, and has used +the experimental `pd.NA` sentinel for missing values (which was also [introduced +in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#experimental-na-scalar-to-denote-missing-values)). +However, up to this date, pandas has not yet taken the step to use `pd.NA` for +for any default dtype, and thus the `StringDtype` deviates in missing value +behaviour compared to the default data types. + +In 2023, [PDEP-10](https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html) +proposed to start using a PyArrow-backed string dtype by default in pandas 3.0 +(i.e. infer this type for string data instead of object dtype). To ensure we +could use the variant of `StringDtype` backed by PyArrow instead of Python +objects (for better performance), it proposed to make `pyarrow` a new required +runtime dependency of pandas. + +In the meantime, NumPy has also been working on a native variable-width string +data type, which was made available [starting with NumPy +2.0](https://numpy.org/devdocs/release/2.0.0-notes.html#stringdtype-has-been-added-to-numpy). +This can provide a potential alternative to PyArrow for implementing a string +data type in pandas that is not backed by Python objects. + +After acceptance of PDEP-10, two aspects of the proposal have been under +reconsideration: + +- Based on feedback from users and maintainers from other packages (mostly + around installation complexity and size), it has been considered to relax the + new `pyarrow` requirement to not be a _hard_ runtime dependency. In addition, + NumPy 2.0 could in the future potentially reduce the need to make PyArrow a + required dependency specifically for a dedicated pandas string dtype. +- PDEP-10 did not consider the usage of the experimental `pd.NA` as a + consequence of adopting one of the existing implementations of the + `StringDtype`. + +For the second aspect, another variant of the `StringDtype` was +[introduced in pandas 2.1](https://pandas.pydata.org/docs/whatsnew/v2.1.0.html#whatsnew-210-enhancements-infer-strings) +that is still backed by PyArrow but follows the default missing values semantics +pandas uses for all other default data types (and using `NaN` as the missing +value sentinel) ([GH-54792](https://github.com/pandas-dev/pandas/issues/54792)). +At the time, the `storage` option for this new variant was called +`"pyarrow_numpy"` to disambiguate from the existing `"pyarrow"` option using +`pd.NA` (but this PDEP proposes a better naming scheme, see the "Naming" +subsection below). + +This last dtype variant is what users currently (pandas 2.2) get for string data +when enabling the ``future.infer_string`` option (to enable the behaviour which +is intended to become the default in pandas 3.0). + +## Proposal + +To be able to move forward with a string data type in pandas 3.0, this PDEP proposes: + +1. For pandas 3.0, a `"str"` string dtype is enabled by default, i.e. this + string dtype will be used as the default dtype for text data when creating + pandas objects (e.g. inference in constructors, I/O functions). +2. This default string dtype will follow the same behaviour for missing values + as other default data types, and use `NaN` as the missing value sentinel. +3. The string dtype will use PyArrow if installed, and otherwise falls back to + an in-house functionally-equivalent (but slower) version. This fallback can + reuse (with minor code additions) the existing numpy object-dtype backed + StringArray for its implementation. +4. Installation guidelines are updated to clearly encourage users to install + pyarrow for the default user experience. + +Those string dtypes enabled by default will then no longer be considered as +experimental. + +### Default inference of a string dtype + +By default, pandas will infer this new string dtype instead of object dtype for +string data (when creating pandas objects, such as in constructors or IO +functions). + +In pandas 2.2, the existing `future.infer_string` option can be used to opt-in to the future +default behaviour: + +```python +>>> pd.options.future.infer_string = True +>>> pd.Series(["a", "b", None]) +0 a +1 b +2 NaN +dtype: string +``` + +Right now (pandas 2.2), the existing option only enables the PyArrow-based +future dtype. For the remaining 2.x releases, this option will be expanded to +also work when PyArrow is not installed to enable the object-dtype fallback in +that case. + +### Missing value semantics + +As mentioned in the background section, the original `StringDtype` has always +used the experimental `pd.NA` sentinel for missing values. In addition to using +`pd.NA` as the scalar for a missing value, this essentially means that: + +- String columns follow ["NA-semantics"](https://pandas.pydata.org/docs/user_guide/missing_data.html#na-semantics) + for missing values, where `NA` propagates in boolean operations such as + comparisons or predicates. +- Operations on the string column that give a numeric or boolean result use the + nullable Integer/Float/Boolean data types (e.g. `ser.str.len()` returns the + nullable `"Int64"` / `pd.Int64Dtype()` dtype instead of the numpy `int64` + dtype (or `float64` in case of missing values)). + +However, up to this date, all other default data types still use `NaN` semantics +for missing values. Therefore, this proposal says that a new default string +dtype should also still use the same default missing value semantics and return +default data types when doing operations on the string column, to be consistent +with the other default dtypes at this point. + +In practice, this means that the default string dtype will use `NaN` as +the missing value sentinel, and: + +- String columns will follow NaN-semantics for missing values, where `NaN` gives + False in boolean operations such as comparisons or predicates. +- Operations on the string column that give a numeric or boolean result will use + the default data types (i.e. numpy `int64`/`float64`/`bool`). + +Because the original `StringDtype` implementations already use `pd.NA` and +return masked integer and boolean arrays in operations, a new variant of the +existing dtypes that uses `NaN` and default data types was needed. The original +variant of `StringDtype` using `pd.NA` will continue to be available for those +who were already using it. + +### Object-dtype "fallback" implementation + +To avoid a hard dependency on PyArrow for pandas 3.0, this PDEP proposes to keep +a "fallback" option in case PyArrow is not installed. The original `StringDtype` +backed by a numpy object-dtype array of Python strings can be mostly reused for +this (adding a new variant of the dtype) and a new `StringArray` subclass only +needs minor changes to follow the above-mentioned missing value semantics +([GH-58451](https://github.com/pandas-dev/pandas/pull/58451)). + +For pandas 3.0, this is the most realistic option given this implementation has +already been available for a long time. Beyond 3.0, further improvements such as +using NumPy 2.0 ([GH-58503](https://github.com/pandas-dev/pandas/issues/58503)) +or nanoarrow ([GH-58552](https://github.com/pandas-dev/pandas/issues/58552)) can +still be explored, but at that point that is an implementation detail that +should not have a direct impact on users (except for performance). + +For the original variant of `StringDtype` using `pd.NA`, currently the default +storage is `"python"` (the object-dtype based implementation). Also for this +variant, it is proposed to follow the same logic for determining the default +storage, i.e. default to `"pyarrow"` if available, and otherwise +fall back to `"python"`. + +### Naming + +Given the long history of this topic, the naming of the dtypes is a difficult +topic. + +In the first place, it should be acknowledged that most users should not need to +use storage-specific options. Users are expected to specify a generic name (such +as `"str"` or `"string"`), and that will give them their default string dtype +(which depends on whether PyArrow is installed or not). + +For the generic string alias to specify the dtype, `"string"` is already used +for the `StringDtype` using `pd.NA`. This PDEP proposes to use `"str"` for the +new default `StringDtype` using `NaN`. This ensures backwards compatibility for +code using `dtype="string"`, and was also chosen because `dtype="str"` or +`dtype=str` currently already works to ensure your data is converted to +strings (only using object dtype for the result). + +But for testing purposes and advanced use cases that want control over the exact +variant of the `StringDtype`, we need some way to specify this and distinguish +them from the other string dtypes. + +Currently (pandas 2.2), `StringDtype(storage="pyarrow_numpy")` is used for the new variant using `NaN`, +where the `"pyarrow_numpy"` storage was used to disambiguate from the existing +`"pyarrow"` option using `pd.NA`. However, `"pyarrow_numpy"` is a rather confusing +option and doesn't generalize well. Therefore, this PDEP proposes a new naming +scheme as outlined below, and `"pyarrow_numpy"` will be deprecated as an alias +in pandas 2.3 and removed in pandas 3.0. + +The `storage` keyword of `StringDtype` is kept to disambiguate the underlying +storage of the string data (using pyarrow or python objects), but an additional +`na_value` is introduced to disambiguate the the variants using NA semantics +and NaN semantics. + +Overview of the different ways to specify a dtype and the resulting concrete +dtype of the data: + +| User specification | Concrete dtype | String alias | Note | +|---------------------------------------------|---------------------------------------------------------------|---------------------------------------|----------| +| Unspecified (inference) | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str" | (1) | +| `"str"` or `StringDtype(na_value=np.nan)` | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str" | (1) | +| `StringDtype("pyarrow", na_value=np.nan)` | `StringDtype(storage="pyarrow", na_value=np.nan)` | "str" | | +| `StringDtype("python", na_value=np.nan)` | `StringDtype(storage="python", na_value=np.nan)` | "str" | | +| `StringDtype("pyarrow")` | `StringDtype(storage="pyarrow", na_value=pd.NA)` | "string[pyarrow]" | | +| `StringDtype("python")` | `StringDtype(storage="python", na_value=pd.NA)` | "string[python]" | | +| `"string"` or `StringDtype()` | `StringDtype(storage="pyarrow"\|"python", na_value=pd.NA)` | "string[pyarrow]" or "string[python]" | (1) | +| `StringDtype("pyarrow_numpy")` | `StringDtype(storage="pyarrow", na_value=np.nan)` | "string[pyarrow_numpy]" | (2) | + +Notes: + +- (1) You get "pyarrow" or "python" depending on pyarrow being installed. +- (2) "pyarrow_numpy" is kept temporarily because this is already in a released + version, but it will be deprecated in 2.x and removed for 3.0. + +For the new default string dtype, only the `"str"` alias can be used to +specify the dtype as a string, i.e. pandas would not provide a way to make the +underlying storage (pyarrow or python) explicit through the string alias. This +string alias is only a convenience shortcut and for most users `"str"` is +sufficient (they don't need to specify the storage), and the explicit +`pd.StringDtype(storage=..., na_value=np.nan)` is still available for more +fine-grained control. + +Also for the existing variant using `pd.NA`, specifying the storage through the +string alias could be deprecated, but that is left for a separate decision. + +## Alternatives + +### Why not delay introducing a default string dtype? + +To avoid introducing a new string dtype while other discussions and changes are +in flux (eventually making pyarrow a required dependency? adopting `pd.NA` as +the default missing value sentinel? using the new NumPy 2.0 capabilities? +overhauling all our dtypes to use a logical data type system?), introducing a +default string dtype could also be delayed until there is more clarity in those +other discussions. Specifically, it would avoid temporarily switching to use +`NaN` for the string dtype, while in a future version we might switch back +to `pd.NA` by default. + +However: + +1. Delaying has a cost: it further postpones introducing a dedicated string + dtype that has significant benefits for users, both in usability as (for the + part of the user base that has PyArrow installed) in performance. +2. In case pandas eventually transitions to use `pd.NA` as the default missing value + sentinel, a migration path for _all_ pandas data types will be needed, and thus + the challenges around this will not be unique to the string dtype and + therefore not a reason to delay this. + +Making this change now for 3.0 will benefit the majority of users, and the PDEP +author believes this is worth the cost of the added complexity around "yet +another dtype" (also for other data types we already have multiple variants). + +### Why not use the existing StringDtype with `pd.NA`? + +Wouldn't adding even more variants of the string dtype make things only more +confusing? Indeed, this proposal unfortunately introduces more variants of the +string dtype. However, the reason for this is to ensure the actual default user +experience is _less_ confusing, and the new string dtype fits better with the +other default data types. + +If the new default string data type would use `pd.NA`, then after some +operations, a user can easily end up with a DataFrame that mixes columns using +`NaN` semantics and columns using `NA` semantics (and thus a DataFrame that +could have columns with two different int64, two different float64, two different +bool, etc dtypes). This would lead to a very confusing default experience. + +With the proposed new variant of the StringDtype, this will ensure that for the +_default_ experience, a user will only see only 1 kind of integer dtype, only +kind of 1 bool dtype, etc. For now, a user should only get columns using `pd.NA` +when explicitly opting into this. + +### Naming alternatives + +An initial version of this PDEP proposed to use the `"string"` alias and the +default `pd.StringDtype()` class constructor for the new default dtype. +However, that caused a lot of discussion around backwards compatibility for +existing users of `dtype=pd.StringDtype()` and `dtype="string"`, that uses +`pd.NA` to represent missing values. + +During the discussion, several alternatives have been brought up. Both +alternative keyword names as using a different constructor. In the end, +this PDEP proposes to use a different string alias (`"str"`) but to keep +using the existing `pd.StringDtype` (with the existing `storage` keyword but +with an additional `na_value` keyword) for now to keep the changes as +minimal as possible, leaving a larger overhaul of the dtype system (potentially +including different constructor functions or namespace) for a future discussion. +See [GH-58613](https://github.com/pandas-dev/pandas/issues/58613) for the full +discussion. + +One consequence is that when using the class constructor for the default dtype, +it has to be used with non-default arguments, i.e. a user needs to specify +`pd.StringDtype(na_value=np.nan)` to get the default dtype using `NaN`. +Therefore, the pandas documentation will focus on the usage of `dtype="str"`. + +## Backward compatibility + +The most visible backwards incompatible change will be that columns with string +data will no longer have an `object` dtype. Therefore, code that assumes +`object` dtype (such as `ser.dtype == object`) will need to be updated. This +change is done as a hard break in a major release, as warning in advance for the +changed inference is deemed too noisy. + +To allow testing code in advance, the +`pd.options.future.infer_string = True` option is available for users. + +Otherwise, the actual string-specific functionality (such as the `.str` accessor +methods) should generally all keep working as is. + +By preserving the current missing value semantics, this proposal is also mostly +backwards compatible on this aspect. When storing strings in object dtype, pandas +however did allow using `None` as the missing value indicator as well (and in +certain cases such as the `shift` method, pandas even introduced this itself). +For all the cases where currently `None` was used as the missing value sentinel, +this will change to consistently use `NaN`. + +### For existing users of `StringDtype` + +Existing code that already opted in to use the `StringDtype` using `pd.NA` +should generally keep working as is. The latest version of this PDEP preserves +the behaviour of `dtype="string"` or `dtype=pd.StringDtype()` to mean the +`pd.NA` variant of the dtype. + +It does propose the change the default storage to `"pyarrow"` (if available) for +the opt-in `pd.NA` variant as well, but this should have limited, if any, +user-visible impact. + +## Timeline + +The future PyArrow-backed string dtype was already made available behind a feature +flag in pandas 2.1 (enabled by `pd.options.future.infer_string = True`). + +The variant using numpy object-dtype can also be backported to the 2.2.x branch +to allow easier testing. It is proposed to release this as 2.3.0 (created from +the 2.2.x branch, given that the main branch already includes many other changes +targeted for 3.0), together with the changes to the naming scheme. + +The 2.3.0 release would then have all future string functionality available +(both the pyarrow and object-dtype based variants of the default string dtype). + +For pandas 3.0, this `future.infer_string` flag becomes enabled by default. + +## PDEP-14 History + +- 3 May 2024: Initial version From 57780490333db611d4a3242bf090bb44fe38c8ba Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Jul 2024 11:36:25 +0200 Subject: [PATCH 02/52] TST / string dtype: add env variable to enable future_string and add test build (#58459) --- .github/workflows/unit-tests.yml | 7 ++++++- ci/run_tests.sh | 6 ++++++ pandas/core/config_init.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index b54fe8f044c78..7b6f4e152f3a3 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -85,6 +85,10 @@ jobs: env_file: actions-39.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" + - name: "Future infer strings" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + pandas_future_infer_string: "1" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" @@ -103,7 +107,8 @@ jobs: LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} - PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + PANDAS_CI: '1' + PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }} TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 39ab0890a32d1..9b48778c41804 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -16,5 +16,11 @@ if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi +# temporarily let pytest always succeed (many tests are not yet passing in the +# build enabling the future string dtype) +if [[ "$PANDAS_FUTURE_INFER_STRING" == "1" ]]; then + PYTEST_CMD="$PYTEST_CMD || true" +fi + echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a8b63f97141c2..a6625d99eaa71 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", From a494ed8349882d43a7215b33b1891e3ffbb6c61c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Jul 2024 19:20:59 +0200 Subject: [PATCH 03/52] REF (string dtype): rename using_pyarrow_string_dtype to using_string_dtype (#59320) --- pandas/_config/__init__.py | 2 +- pandas/_libs/lib.pyx | 4 +- pandas/core/construction.py | 16 ++--- pandas/core/dtypes/cast.py | 4 +- pandas/core/indexes/base.py | 4 +- pandas/core/internals/construction.py | 4 +- pandas/io/feather_format.py | 6 +- pandas/io/orc.py | 4 +- pandas/io/parquet.py | 4 +- pandas/io/parsers/arrow_parser_wrapper.py | 4 +- pandas/io/pytables.py | 10 +-- pandas/io/sql.py | 4 +- pandas/tests/arithmetic/test_object.py | 4 +- .../arrays/categorical/test_constructors.py | 4 +- pandas/tests/arrays/categorical/test_repr.py | 4 +- pandas/tests/base/test_misc.py | 4 +- pandas/tests/base/test_unique.py | 4 +- pandas/tests/extension/base/ops.py | 4 +- pandas/tests/extension/test_categorical.py | 4 +- .../frame/constructors/test_from_dict.py | 6 +- .../frame/constructors/test_from_records.py | 4 +- pandas/tests/frame/methods/test_fillna.py | 6 +- .../tests/frame/methods/test_interpolate.py | 6 +- pandas/tests/frame/methods/test_replace.py | 62 +++++-------------- pandas/tests/frame/test_api.py | 4 +- pandas/tests/frame/test_arithmetic.py | 6 +- pandas/tests/frame/test_constructors.py | 6 +- pandas/tests/frame/test_reductions.py | 8 +-- pandas/tests/frame/test_repr.py | 4 +- .../tests/indexes/base_class/test_formats.py | 6 +- .../indexes/categorical/test_category.py | 4 +- .../tests/indexes/categorical/test_formats.py | 4 +- pandas/tests/indexes/interval/test_formats.py | 4 +- pandas/tests/indexes/test_old_base.py | 4 +- pandas/tests/indexing/test_coercion.py | 4 +- pandas/tests/indexing/test_indexing.py | 8 +-- pandas/tests/indexing/test_loc.py | 4 +- pandas/tests/io/excel/test_readers.py | 6 +- pandas/tests/io/formats/test_format.py | 10 +-- pandas/tests/io/formats/test_to_string.py | 4 +- pandas/tests/io/json/test_pandas.py | 6 +- pandas/tests/reshape/test_pivot.py | 8 +-- pandas/tests/series/indexing/test_where.py | 4 +- pandas/tests/series/methods/test_reindex.py | 6 +- pandas/tests/series/methods/test_replace.py | 6 +- pandas/tests/series/test_formats.py | 4 +- 46 files changed, 122 insertions(+), 176 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 97784c924dab4..838b6affd2836 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -52,6 +52,6 @@ def using_nullable_dtypes() -> bool: return _mode_options["nullable_dtypes"] -def using_pyarrow_string_dtype() -> bool: +def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7656e8d986117..1c2bba031e523 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,7 +37,7 @@ from cython cimport ( floating, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.missing import check_na_tuples_nonequal @@ -2725,7 +2725,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f8250ae475a10..3de28ea242ce8 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -19,7 +19,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas._libs.tslibs import ( @@ -566,11 +566,7 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - if ( - isinstance(data, str) - and using_pyarrow_string_dtype() - and original_dtype is None - ): + if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype dtype = StringDtype("pyarrow_numpy") @@ -604,14 +600,10 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) - if ( - object_index - and using_pyarrow_string_dtype() - and is_string_dtype(subarr) - ): + if object_index and using_string_dtype() and is_string_dtype(subarr): # Avoid inference when string option is set subarr = data - elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b72293b52df06..af4c7c2c7c4f8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import ( Interval, @@ -798,7 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj - if using_pyarrow_string_dtype(): + if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6822c2c99427e..76ddfffaa8a4d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,7 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -7011,7 +7011,7 @@ def insert(self, loc: int, item) -> Index: out = Index._with_infer(new_values, name=self.name) if ( - using_pyarrow_string_dtype() + using_string_dtype() and is_string_dtype(out.dtype) and new_values.dtype == object ): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 609d2c9a7a285..14d7cadd21400 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,7 +13,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib @@ -375,7 +375,7 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] - elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): dtype = StringDtype(storage="pyarrow_numpy") obj_columns = list(values) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d0aaf83b84cb2..68c73483add3f 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,7 +6,7 @@ Any, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -120,7 +120,7 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): + if dtype_backend is lib.no_default and not using_string_dtype(): return feather.read_feather( handles.handle, columns=columns, use_threads=bool(use_threads) ) @@ -137,7 +137,7 @@ def read_feather( elif dtype_backend == "pyarrow": return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) else: raise NotImplementedError diff --git a/pandas/io/orc.py b/pandas/io/orc.py index fed9463c38d5d..5706336b71697 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,7 +9,7 @@ Literal, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -127,7 +127,7 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: - if using_pyarrow_string_dtype(): + if using_string_dtype(): types_mapper = arrow_string_types_mapper() else: types_mapper = None diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9570d6f8b26bd..cc33c87dfc55a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -12,7 +12,7 @@ import warnings from warnings import catch_warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._config.config import _get_option from pandas._libs import lib @@ -257,7 +257,7 @@ def read( to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = _get_option("mode.data_manager", silent=True) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 890b22154648e..c774638fd73f7 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -295,7 +295,7 @@ def read(self) -> DataFrame: dtype_mapping = _arrow_dtype_mapping() dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 13c2f10785124..12bb93a63f850 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -31,7 +31,7 @@ config, get_option, using_copy_on_write, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -3224,7 +3224,7 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): result = result.astype("string[pyarrow_numpy]") return result @@ -3293,7 +3293,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): df = df.astype("string[pyarrow_numpy]") dfs.append(df) @@ -4679,9 +4679,9 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): + if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_pyarrow_string_dtype() and is_string_array( + if using_string_dtype() and is_string_array( values, # type: ignore[arg-type] skipna=True, ): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3e17175167f25..03ef1792f1fb8 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -32,7 +32,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -2215,7 +2215,7 @@ def read_table( from pandas.io._util import _arrow_dtype_mapping mapping = _arrow_dtype_mapping().get - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): from pandas.io._util import arrow_string_types_mapper arrow_string_types_mapper() diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4ffd76722286a..884e6e002800e 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -303,7 +303,7 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") + @pytest.mark.xfail(using_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 373f1c95463fc..6813683cb5219 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import ( is_float_dtype, @@ -449,7 +449,7 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings") def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index ef0315130215c..e2e5d47f50209 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -78,7 +78,7 @@ def test_print_none_width(self): assert exp == repr(a) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Change once infer_string is set to True by default", ) def test_unicode_print(self): diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 65e234e799353..3e0d8b1afedc0 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import PYPY @@ -83,7 +83,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( - PYPY or using_pyarrow_string_dtype(), + PYPY or using_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index d3fe144f70cfc..8314fa56b5bda 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -100,7 +100,7 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") +@pytest.mark.xfail(using_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 5cd66d8a874c7..fad2560265d21 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import is_string_dtype @@ -37,7 +37,7 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_pyarrow_string_dtype() and result is not None: + if using_string_dtype() and result is not None: import pyarrow as pa result = ( # type: ignore[assignment] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6f33b18b19c51..135ea67c924d0 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -18,7 +18,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import Categorical @@ -103,7 +103,7 @@ def test_contains(self, data, data_missing): continue assert na_value_obj not in data # this section suffers from super method - if not using_pyarrow_string_dtype(): + if not using_string_dtype(): assert na_value_obj in data_missing def test_empty(self, dtype): diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 60a8e688b3b8a..4237e796e052e 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -44,9 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="columns inferring logic broken" - ) + @pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 3622571f1365d..4eaf32798ca60 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,7 +6,7 @@ import pytest import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_little_endian @@ -59,7 +59,7 @@ def test_from_records_with_datetimes(self): tm.assert_frame_equal(result, expected) @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" + using_string_dtype(), reason="dtype checking logic doesn't work" ) def test_from_records_sequencelike(self): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 89c50a8c21e1c..774e938e887b4 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -91,7 +91,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -664,7 +664,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 252b950004bea..7bde3041d46c9 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td @@ -70,7 +70,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert orig.squeeze()[1] == 1.5 @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic(self, using_copy_on_write): df = DataFrame( @@ -114,7 +114,7 @@ def test_interp_basic(self, using_copy_on_write): assert np.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8bfa98042eb07..0884c091ba96a 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -30,9 +30,7 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -299,9 +297,7 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -310,9 +306,7 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -341,9 +335,7 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -359,9 +351,7 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -566,9 +556,7 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -580,9 +568,7 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -946,9 +932,7 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -977,9 +961,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_dict_no_regex(self): answer = Series( { @@ -1003,9 +985,7 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_series_no_regex(self): answer = Series( { @@ -1112,9 +1092,7 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) @@ -1126,9 +1104,7 @@ def test_replace_swapping_bug(self, using_infer_string): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_period(self): d = { "fname": { @@ -1165,9 +1141,7 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_datetime(self): d = { "fname": { @@ -1393,9 +1367,7 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1672,9 +1644,7 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index c7b444045a0f2..339800538f47b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._config.config import option_context import pandas as pd @@ -113,7 +113,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") + @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 0593de7556406..e2d469f1124d3 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -253,9 +253,7 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't compare string and int" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int") def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index cae2f6e81d384..b45eca127b3e4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,7 +21,7 @@ import pytest import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat.numpy import np_version_gt2 @@ -327,7 +327,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") @@ -335,7 +335,7 @@ def test_1d_object_array_does_not_copy(self): assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 66145c32c18d7..45d06c56d353f 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import ( IS64, @@ -465,7 +465,7 @@ def test_mixed_ops(self, op): getattr(df, op)() @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + using_string_dtype(), reason="sum doesn't work for arrow strings" ) def test_reduce_mixed_frame(self): # GH 6806 @@ -1960,9 +1960,7 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" -) +@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings") def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 776007fb9691d..f7700af6beea0 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( NA, @@ -176,7 +176,7 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") + @pytest.mark.xfail(using_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index f30b578cfcf56..955e3be107f75 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -16,7 +16,7 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -81,7 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 03a298a13dc2b..166e628ae4b3e 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -196,7 +196,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") + @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 522ca1bc2afde..e8489e4ad8161 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -3,7 +3,7 @@ """ import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex @@ -19,7 +19,7 @@ def test_format_different_scalar_lengths(self): with tm.assert_produces_warning(FutureWarning, match=msg): assert idx.format() == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 3b8e18463160f..d20611a61b154 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -42,7 +42,7 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 1787379b0faee..bfbb0c3dda5c5 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.tslibs import Timestamp @@ -426,7 +426,7 @@ def test_insert_base(self, index): assert index[0:4].equals(result) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="completely different behavior, tested elsewher", ) def test_insert_out_of_bounds(self, index): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 0e32399b131c3..31840cb84b7c4 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import ( IS64, @@ -834,7 +834,7 @@ def replacer(self, how, from_key, to_key): return replacer # Expected needs adjustment for the infer string option, seems to work as expecetd - @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") + @pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 57f45f867254d..e032936d09ce4 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import IndexingError @@ -461,9 +461,7 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't multiply arrow strings" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings") def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -689,7 +687,7 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") + @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 0cd1390d41461..d33719f3e2115 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -12,7 +12,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas.compat.numpy import np_version_gt2 @@ -1262,7 +1262,7 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") + @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_loc_setitem_str_to_small_float_conversion_type(self): # GH#20388 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8da8535952dcf..8d36dc7520019 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -16,7 +16,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -659,9 +659,7 @@ def test_dtype_backend_and_dtype(self, read_ext): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="infer_string takes precedence" - ) + @pytest.mark.xfail(using_string_dtype(), reason="infer_string takes precedence") def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0ca29c219b55b..535ef76cb12f4 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -1396,9 +1396,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="Fixup when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") def test_east_asian_unicode_series(self): # not aligned properly because of east asian width @@ -1773,9 +1771,7 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="change when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") def test_format_explicit(self): test_sers = gen_series_formatting() with option_context("display.max_rows", 4, "display.show_dimensions", False): diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 2e5a5005cb076..164e514262603 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( CategoricalIndex, @@ -851,7 +851,7 @@ def test_to_string(self): frame.to_string() # TODO: split or simplify this test? - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="fix when arrow is default") + @pytest.mark.xfail(using_string_dtype(), reason="fix when arrow is default") def test_to_string_index_with_nan(self): # GH#2850 df = DataFrame( diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5279f3f1cdfbe..2157498aea95e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -13,7 +13,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import IS64 import pandas.util._test_decorators as td @@ -1492,7 +1492,7 @@ def test_from_json_to_json_table_dtypes(self): # TODO: We are casting to string which coerces None to NaN before casting back # to object, ending up with incorrect na values - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") + @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 @@ -1751,7 +1751,7 @@ def test_to_json_indent(self, indent): assert result == expected @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Adjust expected when infer_string is default, no bug here, " "just a complicated parametrization", ) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 18a449b4d0c67..7b27d19483bd2 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import PerformanceWarning @@ -2611,7 +2611,7 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2627,7 +2627,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2641,7 +2641,7 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index c978481ca9988..013386202c966 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import is_integer @@ -232,7 +232,7 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") +@pytest.mark.xfail(using_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 6f0c8d751a92a..aa44eccf67446 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -24,9 +24,7 @@ import pandas._testing as tm -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" -) +@pytest.mark.xfail(using_string_dtype(), reason="share memory doesn't work for arrow") def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index b0f4e233ba5eb..c0191abced797 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -391,7 +391,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -731,7 +731,7 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index a1c5018ea7961..f684e8fc1e724 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -145,7 +145,7 @@ def test_tidy_repr_name_0(self, arg): assert "Name: 0" in rep_str @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" + using_string_dtype(), reason="TODO: investigate why this is failing" ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) From 06dbb7abd7c52425da56fd088e5c9097a3b3a274 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 17:14:00 +0200 Subject: [PATCH 04/52] TST (string dtype): clean-up xpasssing tests with future string dtype (#59323) --- pandas/tests/arithmetic/test_object.py | 3 --- pandas/tests/base/test_unique.py | 5 +---- pandas/tests/frame/constructors/test_from_dict.py | 3 ++- pandas/tests/frame/constructors/test_from_records.py | 4 +--- pandas/tests/frame/methods/test_fillna.py | 2 ++ pandas/tests/frame/methods/test_info.py | 11 ++++++----- pandas/tests/frame/methods/test_interpolate.py | 1 + pandas/tests/frame/test_arithmetic.py | 3 --- pandas/tests/indexes/interval/test_formats.py | 7 ++----- pandas/tests/indexing/test_coercion.py | 4 ---- pandas/tests/indexing/test_indexing.py | 4 ---- pandas/tests/series/methods/test_reindex.py | 3 --- pandas/tests/series/methods/test_replace.py | 1 - pandas/tests/series/test_formats.py | 2 +- 14 files changed, 16 insertions(+), 37 deletions(-) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 884e6e002800e..4b5156d0007bb 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -303,7 +301,6 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 8314fa56b5bda..1add56b47b363 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -100,12 +98,11 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji - obj = index_or_series([uval] * 2) + obj = index_or_series([uval] * 2, dtype=object) result = obj.unique() if isinstance(obj, pd.Index): diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 4237e796e052e..fc7c03dc25839 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -44,7 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken") + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), @@ -108,6 +108,7 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 4eaf32798ca60..58e47ba48f894 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -58,9 +58,7 @@ def test_from_records_with_datetimes(self): expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_string_dtype(), reason="dtype checking logic doesn't work" - ) + @pytest.mark.xfail(using_string_dtype(), reason="dtype checking logic doesn't work") def test_from_records_sequencelike(self): df = DataFrame( { diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 774e938e887b4..e2baa2567f5b4 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -91,6 +91,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") + # TODO(infer_string) test as actual error instead of xfail @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame @@ -664,6 +665,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() + # TODO(infer_string) test as actual error instead of xfail @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index fcb7677f03f27..c3d02a07f397e 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -15,6 +15,7 @@ from pandas import ( CategoricalIndex, DataFrame, + Index, MultiIndex, Series, date_range, @@ -360,7 +361,7 @@ def test_info_memory_usage(): df = DataFrame(data) df.columns = dtypes - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) @@ -398,25 +399,25 @@ def test_info_memory_usage(): @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() > df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() == df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 7bde3041d46c9..bbb5e59e4a274 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -69,6 +69,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + # TODO(infer_string) raise proper TypeError in case of string dtype @pytest.mark.xfail( using_string_dtype(), reason="interpolate doesn't work for string" ) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e2d469f1124d3..f68785a354d7e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -253,7 +251,6 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) - @pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int") def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index d20611a61b154..f858ae137ca4e 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, DatetimeIndex, @@ -42,12 +40,11 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 markers = Series( - ["foo", "bar"], + [1, 2], index=IntervalIndex( [ Interval(left, right) @@ -59,7 +56,7 @@ def test_repr_floats(self): ), ) result = str(markers) - expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object" + expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 31840cb84b7c4..c0a62ecb06f56 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -833,8 +831,6 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - # Expected needs adjustment for the infer string option, seems to work as expecetd - @pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e032936d09ce4..162c2c58ff744 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -461,7 +459,6 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings") def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -687,7 +684,6 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index aa44eccf67446..0923a2d42ce10 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -24,7 +22,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="share memory doesn't work for arrow") def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c0191abced797..850740fac907d 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -391,7 +391,6 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index f684e8fc1e724..4939f3221d268 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -145,7 +145,7 @@ def test_tidy_repr_name_0(self, arg): assert "Name: 0" in rep_str @pytest.mark.xfail( - using_string_dtype(), reason="TODO: investigate why this is failing" + using_string_dtype(), reason="TODO(infer_string): investigate failure" ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) From 925c21c9b8740eadf611ae7e2922c354f92cbbb0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 21:40:31 +0200 Subject: [PATCH 05/52] String dtype: rename the storage options and add `na_value` keyword in `StringDtype()` (#59330) * rename storage option and add na_value keyword * update init * fix propagating na_value to Array class + fix some tests * fix more tests * disallow pyarrow_numpy as option + fix more cases of checking storage to be pyarrow_numpy * restore pyarrow_numpy as option for now * linting * try fix typing * try fix typing * fix dtype equality to take into account the NaN vs NA * fix pickling of dtype * fix test_convert_dtypes * update expected result for dtype='string' * suppress typing error with _metadata attribute --- pandas/_libs/lib.pyx | 2 +- pandas/_testing/__init__.py | 4 +- pandas/core/arrays/arrow/array.py | 6 +- pandas/core/arrays/string_.py | 89 ++++++++++++++----- pandas/core/arrays/string_arrow.py | 11 ++- pandas/core/construction.py | 4 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/base.py | 3 +- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/encoding.py | 3 +- pandas/core/reshape/merge.py | 3 +- pandas/core/tools/numeric.py | 9 +- pandas/io/_util.py | 6 +- pandas/tests/arrays/string_/test_string.py | 83 +++++++++++------ .../tests/arrays/string_/test_string_arrow.py | 4 +- pandas/tests/extension/base/methods.py | 8 +- pandas/tests/extension/test_string.py | 44 +++++---- .../frame/methods/test_convert_dtypes.py | 1 + pandas/tests/series/test_constructors.py | 7 +- pandas/tests/strings/__init__.py | 2 +- 20 files changed, 186 insertions(+), 107 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1c2bba031e523..1222b33aac3c1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2728,7 +2728,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 87d419e2db8dd..994b351acf42c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -527,14 +527,14 @@ def shares_memory(left, right) -> bool: if ( isinstance(left, ExtensionArray) and is_string_dtype(left.dtype) - and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left = cast("ArrowExtensionArray", left) if ( isinstance(right, ExtensionArray) and is_string_dtype(right.dtype) - and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): right = cast("ArrowExtensionArray", right) left_pa_data = left._pa_array diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f2b8aa75ca5bf..a156042ac0c0e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -570,10 +570,8 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ): + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 00197a150fb97..c40f5b8f58d9e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -8,7 +8,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( lib, @@ -80,8 +83,10 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow"}, optional If not given, the value of ``pd.options.mode.string_storage``. + na_value : {np.nan, pd.NA}, default pd.NA + Whether the dtype follows NaN or NA missing value semantics. Attributes ---------- @@ -108,30 +113,67 @@ class StringDtype(StorageExtensionDtype): # follows NumPy semantics, which uses nan. @property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": - return np.nan - else: - return libmissing.NA + return self._na_value - _metadata = ("storage",) + _metadata = ("storage", "_na_value") # type: ignore[assignment] - def __init__(self, storage=None) -> None: + def __init__( + self, + storage: str | None = None, + na_value: libmissing.NAType | float = libmissing.NA, + ) -> None: + # infer defaults if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if using_string_dtype(): + storage = "pyarrow" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + + if storage == "pyarrow_numpy": + # TODO raise a deprecation warning + storage = "pyarrow" + na_value = np.nan + + # validate options + if storage not in {"python", "pyarrow"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) + + if isinstance(na_value, float) and np.isnan(na_value): + # when passed a NaN value, always set to np.nan to ensure we use + # a consistent NaN value (and we can use `dtype.na_value is np.nan`) + na_value = np.nan + elif na_value is not libmissing.NA: + raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + self.storage = storage + self._na_value = na_value + + def __eq__(self, other: object) -> bool: + # we need to override the base class __eq__ because na_value (NA or NaN) + # cannot be checked with normal `==` + if isinstance(other, str): + if other == self.name: + return True + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return self.storage == other.storage and self.na_value is other.na_value + return False + + def __hash__(self) -> int: + # need to override __hash__ as well because of overriding __eq__ + return super().__hash__() + + def __reduce__(self): + return StringDtype, (self.storage, self.na_value) @property def type(self) -> type[str]: @@ -176,6 +218,7 @@ def construct_from_string(cls, string) -> Self: elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": + # TODO deprecate return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -200,7 +243,7 @@ def construct_array_type( # type: ignore[override] if self.storage == "python": return StringArray - elif self.storage == "pyarrow": + elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray else: return ArrowStringArrayNumpySemantics @@ -212,13 +255,17 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. """ if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ArrowStringArray + + return ArrowStringArray(array) + else: + from pandas.core.arrays.string_arrow import ( + ArrowStringArrayNumpySemantics, + ) - return ArrowStringArray(array) - elif self.storage == "pyarrow_numpy": - from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + return ArrowStringArrayNumpySemantics(array) - return ArrowStringArrayNumpySemantics(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 50527dace0b82..94f6f9064885e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -125,6 +125,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() @@ -134,7 +135,7 @@ def __init__(self, values) -> None: values = pc.cast(values, pa.large_string()) super().__init__(values) - self._dtype = StringDtype(storage=self._storage) + self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -179,10 +180,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -598,7 +596,8 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow_numpy" + _storage = "pyarrow" + _na_value = np.nan @classmethod def _result_converter(cls, values, na=None): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 3de28ea242ce8..748b9e4947bec 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -569,7 +569,7 @@ def sanitize_array( if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype("pyarrow", na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -606,7 +606,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index af4c7c2c7c4f8..5d4e56cd8f800 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 76ddfffaa8a4d..11d2436b0e095 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5620,9 +5620,10 @@ def equals(self, other: Any) -> bool: if ( isinstance(self.dtype, StringDtype) - and self.dtype.storage == "pyarrow_numpy" + and self.dtype.na_value is np.nan and other.dtype != self.dtype ): + # TODO(infer_string) can we avoid this special case? # special case for object behavior return other.equals(self.astype(object)) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 14d7cadd21400..144416fc11691 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -376,7 +376,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 3ed67bb7b7c02..85c10f1166577 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -13,6 +13,7 @@ import numpy as np +from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex from pandas.core.dtypes.common import ( @@ -260,7 +261,7 @@ def _get_dummies_1d( dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] elif ( isinstance(input_dtype, StringDtype) - and input_dtype.storage != "pyarrow_numpy" + and input_dtype.na_value is libmissing.NA ): dtype = pandas_dtype("boolean") # type: ignore[assignment] else: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 646f40f6141d8..dc2df25c3f786 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2473,8 +2473,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) - and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 09652a7d8bc92..ca703e0362611 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -8,7 +8,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -235,7 +238,7 @@ def to_numeric( coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy", + and values_dtype.na_value is libmissing.NA, ) except (ValueError, TypeError): if errors == "raise": @@ -250,7 +253,7 @@ def to_numeric( dtype_backend is not lib.no_default and new_mask is None or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy" + and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..2dc3b0e6c80ef 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -2,6 +2,8 @@ from typing import Callable +import numpy as np + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -29,6 +31,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow_numpy"), - pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), }.get diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 320bdca60a932..3ef7862d739cd 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -19,13 +19,6 @@ ) -def na_val(dtype): - if dtype.storage == "pyarrow_numpy": - return np.nan - else: - return pd.NA - - @pytest.fixture def dtype(string_storage): """Fixture giving StringDtype from parametrized 'string_storage'""" @@ -38,24 +31,45 @@ def cls(dtype): return dtype.construct_array_type() +def test_dtype_equality(): + pytest.importorskip("pyarrow") + + dtype1 = pd.StringDtype("python") + dtype2 = pd.StringDtype("pyarrow") + dtype3 = pd.StringDtype("pyarrow", na_value=np.nan) + + assert dtype1 == pd.StringDtype("python", na_value=pd.NA) + assert dtype1 != dtype2 + assert dtype1 != dtype3 + + assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA) + assert dtype2 != dtype1 + assert dtype2 != dtype3 + + assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan) + assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan")) + assert dtype3 != dtype1 + assert dtype3 != dtype2 + + def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and dtype.na_value is pd.NA: arr_name = "ArrowStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: @@ -67,7 +81,7 @@ def test_repr(dtype): def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None - assert a[1] is na_val(a.dtype) + assert a[1] is a.dtype.na_value def test_setitem_validates(cls, dtype): @@ -224,7 +238,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -243,7 +257,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -270,7 +284,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -292,7 +306,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -386,7 +400,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: err = ValueError msg = "cannot convert float NaN to integer" else: @@ -443,7 +457,7 @@ def test_min_max(method, skipna, dtype): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is na_val(arr.dtype) + assert result is arr.dtype.na_value @pytest.mark.parametrize("method", ["min", "max"]) @@ -492,7 +506,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) @@ -524,7 +538,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is na_val(result["a"].dtype) + assert result.loc[2, "a"] is result["a"].dtype.na_value @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") @@ -558,10 +572,10 @@ def test_arrow_load_from_zero_chunks( def test_value_counts_na(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -575,10 +589,10 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = np.float64 + elif dtype.storage == "pyarrow": + exp_dtype = "double[pyarrow]" else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -612,6 +626,19 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) +def test_value_counts_sort_false(dtype): + if dtype.na_value is np.nan: + exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" + else: + exp_dtype = "Int64" + ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) + result = ser.value_counts(sort=False) + expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count") + tm.assert_series_equal(result, expected) + + def test_memory_usage(dtype, arrow_string_storage): # GH 33963 @@ -635,7 +662,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", na_val(dtype), "b"], dtype=object) + expected = np.array(["a", dtype.na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -675,7 +702,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is na_val(ser.dtype) + assert ser.array[1] is ser.dtype.na_value # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d7811b6fed883..06013c3d11664 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -29,6 +29,8 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): if using_infer_string and string_storage != "pyarrow_numpy": request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) + if string_storage == "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) @@ -260,6 +262,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python' or 'pyarrow'." with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c803a8113b4a4..5cb2c14e4c841 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data): expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) - if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( data.dtype, pd.ArrowDtype ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") - elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": - # TODO: avoid special-casing - expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 2d5a134f8560a..f3eec142c1968 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -95,9 +95,15 @@ def data_for_grouping(dtype, chunked): class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): - assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) + if dtype.na_value is pd.NA: + # only the NA-variant supports parametrized string alias + assert dtype == f"string[{dtype.storage}]" + elif dtype.storage == "pyarrow": + # TODO(infer_string) deprecate this + assert dtype == "string[pyarrow_numpy]" + def test_is_not_string_type(self, dtype): # Different from BaseDtypeTests.test_is_not_string_type # because StringDtype is a string type @@ -143,28 +149,21 @@ def _get_expected_exception( self, op_name: str, obj, other ) -> type[Exception] | None: if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) and cast( - StringDtype, tm.get_dtype(obj) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if ( + isinstance(obj, pd.Series) + and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError - elif isinstance(other, pd.Series) and cast( - StringDtype, tm.get_dtype(other) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + elif ( + isinstance(other, pd.Series) + and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError return TypeError elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": return NotImplementedError return TypeError elif op_name in ["__mul__", "__rmul__"]: @@ -178,10 +177,7 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": import pyarrow as pa # TODO: better to re-raise as TypeError? @@ -193,7 +189,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) @@ -201,10 +197,10 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype + elif dtype.na_value is np.nan: + cast_to = np.bool_ # type: ignore[assignment] elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 521d2cb14ac6a..9cbbebf35b2d1 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -18,6 +18,7 @@ def test_convert_dtypes( # Just check that it works for DataFrame here if using_infer_string: string_storage = "pyarrow_numpy" + df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 387be8398e4b2..391c9361080d8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2133,9 +2133,12 @@ def test_series_string_inference_array_string_dtype(self): tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): - # GH#54793 + # https://github.com/pandas-dev/pandas/issues/54793 + # but after PDEP-14 (string dtype), it was decided to keep dtype="string" + # returning the NA string dtype, so expected is changed from + # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + expected = Series(["a", "b"], dtype="string[pyarrow]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..e94f656fc9823 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -7,7 +7,7 @@ def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.na_value is np.nan: expected = expected.fillna(np.nan) else: # GH#18463 From 431b246ad6c3f31839bc3715460857fef23d0a0f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:53:24 -0400 Subject: [PATCH 06/52] TST (string dtype): xfail all currently failing tests with future.infer_string (#59329) * TST (string dtype): xfail all currently failing tests with future.infer_string * more xfails * more xfails * add missing strict=False * also run slow and single cpu tests * fix single_cpu tests * xfail some slow tests * stop suppressing non-zero exit code from pytest on string CI build * remove accidentally added xlsx file --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 1 - ci/run_tests.sh | 6 ------ pandas/tests/apply/test_frame_apply.py | 6 ++++++ pandas/tests/apply/test_numba.py | 4 ++++ pandas/tests/apply/test_str.py | 4 ++++ pandas/tests/arrays/categorical/test_analytics.py | 3 +++ pandas/tests/arrays/categorical/test_api.py | 3 +++ .../tests/arrays/categorical/test_constructors.py | 1 + pandas/tests/arrays/floating/test_arithmetic.py | 3 +++ pandas/tests/arrays/integer/test_arithmetic.py | 3 +++ pandas/tests/arrays/masked/test_function.py | 3 +++ pandas/tests/copy_view/test_array.py | 3 +++ pandas/tests/copy_view/test_astype.py | 6 ++++++ pandas/tests/copy_view/test_constructors.py | 3 +++ pandas/tests/copy_view/test_functions.py | 8 ++++++++ pandas/tests/copy_view/test_internals.py | 3 +++ pandas/tests/copy_view/test_interp_fillna.py | 4 ++++ pandas/tests/copy_view/test_methods.py | 4 ++++ pandas/tests/copy_view/test_replace.py | 5 +++++ pandas/tests/dtypes/test_dtypes.py | 3 +++ pandas/tests/extension/test_string.py | 6 ++++++ pandas/tests/frame/indexing/test_coercion.py | 3 +++ pandas/tests/frame/indexing/test_indexing.py | 7 +++++++ pandas/tests/frame/indexing/test_insert.py | 3 +++ pandas/tests/frame/indexing/test_setitem.py | 8 ++++++++ pandas/tests/frame/indexing/test_where.py | 5 +++++ pandas/tests/frame/indexing/test_xs.py | 3 +++ pandas/tests/frame/methods/test_combine_first.py | 3 +++ pandas/tests/frame/methods/test_convert_dtypes.py | 4 ++++ pandas/tests/frame/methods/test_cov_corr.py | 3 +++ pandas/tests/frame/methods/test_dropna.py | 3 +++ pandas/tests/frame/methods/test_dtypes.py | 3 +++ pandas/tests/frame/methods/test_fillna.py | 2 ++ pandas/tests/frame/methods/test_info.py | 5 +++++ pandas/tests/frame/methods/test_quantile.py | 8 ++++++++ pandas/tests/frame/methods/test_replace.py | 3 +++ pandas/tests/frame/methods/test_reset_index.py | 3 +++ pandas/tests/frame/methods/test_to_csv.py | 7 +++++++ .../tests/frame/methods/test_to_dict_of_blocks.py | 3 +++ pandas/tests/frame/test_arithmetic.py | 3 +++ pandas/tests/frame/test_arrow_interface.py | 4 ++++ pandas/tests/frame/test_block_internals.py | 5 +++++ pandas/tests/frame/test_constructors.py | 1 + pandas/tests/frame/test_query_eval.py | 3 +++ pandas/tests/frame/test_reductions.py | 4 ++++ pandas/tests/frame/test_stack_unstack.py | 4 ++++ pandas/tests/frame/test_unary.py | 3 +++ pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++++ pandas/tests/groupby/aggregate/test_cython.py | 4 ++++ pandas/tests/groupby/aggregate/test_other.py | 3 +++ pandas/tests/groupby/methods/test_describe.py | 4 ++++ pandas/tests/groupby/methods/test_nth.py | 3 +++ pandas/tests/groupby/methods/test_quantile.py | 4 ++++ pandas/tests/groupby/methods/test_size.py | 3 +++ pandas/tests/groupby/methods/test_value_counts.py | 4 ++++ pandas/tests/groupby/test_categorical.py | 3 +++ pandas/tests/groupby/test_groupby.py | 6 ++++++ pandas/tests/groupby/test_groupby_dropna.py | 3 +++ pandas/tests/groupby/test_grouping.py | 4 ++++ pandas/tests/groupby/test_pipe.py | 4 ++++ pandas/tests/groupby/test_raises.py | 4 ++++ pandas/tests/groupby/test_reductions.py | 3 +++ pandas/tests/groupby/test_timegrouper.py | 3 +++ pandas/tests/groupby/transform/test_transform.py | 6 ++++++ pandas/tests/indexes/base_class/test_setops.py | 3 +++ pandas/tests/indexes/test_old_base.py | 1 + pandas/tests/indexing/test_iloc.py | 3 +++ pandas/tests/indexing/test_indexing.py | 3 +++ pandas/tests/indexing/test_loc.py | 1 + pandas/tests/interchange/test_impl.py | 4 ++++ pandas/tests/io/excel/test_readers.py | 1 + pandas/tests/io/excel/test_writers.py | 3 +++ pandas/tests/io/formats/style/test_to_latex.py | 3 +++ pandas/tests/io/json/test_pandas.py | 9 +++++++++ pandas/tests/io/parser/common/test_chunksize.py | 3 +++ pandas/tests/io/parser/common/test_common_basic.py | 3 +++ .../tests/io/parser/common/test_file_buffer_url.py | 3 +++ pandas/tests/io/parser/common/test_index.py | 3 +++ pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 7 +++++++ pandas/tests/io/parser/test_c_parser_only.py | 3 +++ pandas/tests/io/parser/test_converters.py | 3 +++ pandas/tests/io/parser/test_mangle_dupes.py | 3 +++ pandas/tests/io/parser/test_na_values.py | 6 ++++++ pandas/tests/io/parser/test_parse_dates.py | 5 +++++ pandas/tests/io/parser/test_python_parser_only.py | 4 ++++ pandas/tests/io/parser/test_read_fwf.py | 3 +++ pandas/tests/io/parser/test_upcast.py | 3 +++ pandas/tests/io/pytables/test_append.py | 7 ++++++- pandas/tests/io/pytables/test_categorical.py | 7 ++++++- pandas/tests/io/pytables/test_complex.py | 6 ++++++ pandas/tests/io/pytables/test_errors.py | 7 ++++++- pandas/tests/io/pytables/test_file_handling.py | 7 ++++++- pandas/tests/io/pytables/test_put.py | 7 ++++++- pandas/tests/io/pytables/test_read.py | 7 ++++++- pandas/tests/io/pytables/test_round_trip.py | 7 ++++++- pandas/tests/io/pytables/test_select.py | 7 ++++++- pandas/tests/io/pytables/test_store.py | 7 ++++++- pandas/tests/io/pytables/test_timezones.py | 6 ++++++ pandas/tests/io/sas/test_sas7bdat.py | 6 ++++++ pandas/tests/io/test_clipboard.py | 6 ++++++ pandas/tests/io/test_common.py | 7 +++++++ pandas/tests/io/test_compression.py | 3 +++ pandas/tests/io/test_feather.py | 4 ++++ pandas/tests/io/test_fsspec.py | 4 ++++ pandas/tests/io/test_gcs.py | 3 +++ pandas/tests/io/test_html.py | 6 ++++++ pandas/tests/io/test_http_headers.py | 3 +++ pandas/tests/io/test_orc.py | 11 ++++++++--- pandas/tests/io/test_parquet.py | 6 +++++- pandas/tests/io/test_sql.py | 11 ++++++++--- pandas/tests/io/test_stata.py | 13 +++++++++++++ pandas/tests/io/xml/test_xml.py | 3 +++ pandas/tests/io/xml/test_xml_dtypes.py | 4 ++++ pandas/tests/reductions/test_reductions.py | 5 +++++ pandas/tests/resample/test_resampler_grouper.py | 3 +++ pandas/tests/reshape/concat/test_concat.py | 3 +++ pandas/tests/reshape/merge/test_merge_asof.py | 3 +++ pandas/tests/reshape/test_from_dummies.py | 3 +++ pandas/tests/reshape/test_melt.py | 10 ++++++++++ pandas/tests/reshape/test_pivot.py | 2 ++ pandas/tests/reshape/test_union_categoricals.py | 3 +++ pandas/tests/series/accessors/test_dt_accessor.py | 6 ++++++ pandas/tests/series/indexing/test_indexing.py | 3 +++ pandas/tests/series/indexing/test_setitem.py | 6 ++++++ pandas/tests/series/methods/test_info.py | 3 +++ pandas/tests/series/methods/test_replace.py | 1 + pandas/tests/series/methods/test_to_csv.py | 3 +++ pandas/tests/series/methods/test_unstack.py | 3 +++ pandas/tests/series/test_arithmetic.py | 3 +++ pandas/tests/series/test_logical_ops.py | 3 +++ pandas/tests/test_algos.py | 4 ++++ 131 files changed, 539 insertions(+), 23 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 7b6f4e152f3a3..3ba61e39316af 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -87,7 +87,6 @@ jobs: pandas_copy_on_write: "warn" - name: "Future infer strings" env_file: actions-311.yaml - pattern: "not slow and not network and not single_cpu" pandas_future_infer_string: "1" - name: "Pypy" env_file: actions-pypy-39.yaml diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 9b48778c41804..39ab0890a32d1 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -16,11 +16,5 @@ if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi -# temporarily let pytest always succeed (many tests are not yet passing in the -# build enabling the future string dtype) -if [[ "$PANDAS_FUTURE_INFER_STRING" == "1" ]]; then - PYTEST_CMD="$PYTEST_CMD || true" -fi - echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b7eac6b8f0ea1..a774ae214e09a 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -61,6 +63,7 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) def test_apply_args(float_frame, axis, raw, engine, request): @@ -1169,6 +1172,7 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_agg_multiple_mixed(): # GH 20909 mdf = DataFrame( @@ -1286,6 +1290,7 @@ def test_agg_reduce(axis, float_frame): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nuiscance_columns(): # GH 15015 df = DataFrame( @@ -1462,6 +1467,7 @@ def test_apply_datetime_tz_issue(engine, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_mixed_column_raises(df, method, using_infer_string): diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 57b81711ddb48..aee9100702350 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -17,6 +19,7 @@ def apply_axis(request): return request.param +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -40,6 +43,7 @@ def test_numba_vs_python_string_index(): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 17e8322dc40e1..8956aed5e9ceb 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_number from pandas import ( @@ -86,6 +88,7 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -144,6 +147,7 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c2c53fbc4637e..7c7a236ab83cd 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PYPY from pandas import ( @@ -296,6 +298,7 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage(self): cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index a939ee5f6f53f..1d948b7495a43 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PY311 from pandas import ( @@ -156,6 +158,7 @@ def test_reorder_categories_raises(self, new_categories): with pytest.raises(ValueError, match=msg): cat.reorder_categories(new_categories) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 6813683cb5219..60d78a906b528 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -743,6 +743,7 @@ def test_interval(self): tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: arr = pd.arrays.StringArray._from_sequence( diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index ba081bd01062a..768d3c1449fa4 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import FloatingArray @@ -122,6 +124,7 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 8acd298f37a07..8aa8c2db940b4 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -172,6 +174,7 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index b259018cd6121..81338bca460a6 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -58,6 +60,7 @@ def test_tolist(data): tm.assert_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_numpy(): # GH#56991 diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 9a3f83e0293f5..5d0efdc149004 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -157,6 +159,7 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d462ce3d3187d..14fd8fb5f911e 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -96,6 +98,7 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -113,6 +116,7 @@ def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -221,6 +225,7 @@ def test_astype_arrow_timestamp(using_copy_on_write): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(using_copy_on_write): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -240,6 +245,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes(using_copy_on_write): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 1aa458a625028..866b1964a334f 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -283,6 +285,7 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): assert np.shares_memory(arr_before, arr_after) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cons", [Series, Index]) @pytest.mark.parametrize( "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 56e4b186350f2..4ec1d023c8ba7 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -12,6 +14,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -36,6 +39,7 @@ def test_concat_frames(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames_updating_input(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -197,6 +201,7 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -260,6 +265,7 @@ def test_merge_on_index(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "func, how", [ @@ -313,6 +319,7 @@ def test_merge_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") @@ -346,6 +353,7 @@ def test_join_on_key(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index a727331307d7e..6f7198520d22e 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -76,6 +78,7 @@ def test_switch_options(): @td.skip_array_manager_invalid_test +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index ddc5879a56d54..338b76cbf1e7a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( NA, ArrowDtype, @@ -135,6 +137,7 @@ def test_interp_fill_functions_inplace( assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_cleaned_fill_method(using_copy_on_write): # Check that "method is set to None" case works correctly df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) @@ -156,6 +159,7 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 5d1eefccbb1e7..d870342ef9e29 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SettingWithCopyWarning import pandas as pd @@ -950,6 +952,7 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_infer_objects(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) df_orig = df.copy() @@ -1177,6 +1180,7 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(using_copy_on_write, warn_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 0beac439fbb58..32da870c6d2e3 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -10,6 +12,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -63,6 +66,7 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_regex_inplace(using_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") @@ -350,6 +354,7 @@ def test_replace_empty_list(using_copy_on_write): assert not df2._mgr._has_no_reference(0) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index de1ddce724a5b..e522d2666a2dc 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -5,6 +5,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.core.dtypes.base import _registry as registry @@ -959,6 +961,7 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(["b", "a"], ordered=True) assert c1 is not c2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("ordered1", [True, False, None]) @pytest.mark.parametrize("ordered2", [True, False, None]) def test_categorical_equality(self, ordered1, ordered2): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index f3eec142c1968..5a72b2244d2bf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -21,6 +21,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -28,6 +30,10 @@ from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def maybe_split_array(arr, chunked): if not chunked: diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index ba0d8613b6228..9d20821ae8bc6 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -97,6 +99,7 @@ def test_6942(indexer_al): assert df.iloc[0, 0] == t2 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_26395(indexer_al): # .at case fixed by GH#45121 (best guess) df = DataFrame(index=["A", "B", "C"]) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 22d9c7f26a57c..09f359df37dd1 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import iNaT from pandas.errors import ( InvalidIndexError, @@ -180,6 +182,7 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() @@ -515,6 +518,7 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] @@ -1181,6 +1185,7 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1203,6 +1208,7 @@ def test_loc_setitem_datetimelike_with_inference(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { @@ -1956,6 +1962,7 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 pytest.importorskip("pyarrow") diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 7e702bdc993bd..82b75459f08d0 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import PerformanceWarning from pandas import ( @@ -60,6 +62,7 @@ def test_insert_column_bug_4032(self): expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_insert_with_columns_dups(self): # GH#14291 df = DataFrame() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index a58dd701f0f22..bce3cb5dacabe 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.base import _registry as ea_registry @@ -146,6 +148,7 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_empty_columns(self): # GH 13522 df = DataFrame(index=["A", "B", "C"]) @@ -161,6 +164,7 @@ def test_setitem_dt64_index_empty_columns(self): df["A"] = rng assert df["A"].dtype == np.dtype("M8[ns]") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) @@ -200,6 +204,7 @@ def test_setitem_with_unaligned_sparse_value(self): expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_period_preserves_dtype(self): # GH: 26861 data = [Period("2003-12", "D")] @@ -672,6 +677,7 @@ def test_setitem_iloc_two_dimensional_generator(self): expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") @@ -706,6 +712,7 @@ def test_setitem_ea_dtype_rhs_series(self): # TODO(ArrayManager) set column with 2d column array, see #44788 @td.skip_array_manager_not_yet_implemented + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -929,6 +936,7 @@ def test_setitem_with_expansion_categorical_dtype(self): ser.name = "E" tm.assert_series_equal(result2.sort_index(), ser.sort_index()) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 3d36d0471f02f..dfbc3b4ca33ad 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -46,6 +48,7 @@ def is_ok(s): class TestDataFrameIndexingWhere: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) @@ -97,6 +100,7 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -172,6 +176,7 @@ def test_where_invalid(self): df.mask(0) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index be809e3a17c8e..4ca435fa5acc5 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SettingWithCopyError from pandas import ( @@ -77,6 +79,7 @@ def test_xs( else: assert (expected == 5).all() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_xs_corner(self): # pathological mixed-type reordering case df = DataFrame(index=[0]) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 8aeab5dacd8b4..a4ee0b08e1e66 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -30,6 +32,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 9cbbebf35b2d1..91fa81b5bee2e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm @@ -181,6 +183,7 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -195,6 +198,7 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 04a08c8b9bc52..721ec4e43eb1b 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -326,6 +328,7 @@ def test_corrwith(self, datetime_frame, dtype): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 7899b4aeac3fd..87a43b4e67c3f 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -182,6 +184,7 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index ab632ac17318e..2556c44e63a77 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -142,6 +144,7 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e2baa2567f5b4..d767e35878b52 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -126,6 +126,7 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( @@ -370,6 +371,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index c3d02a07f397e..475632667a87a 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( IS64, PYPY, @@ -433,6 +435,7 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_info_memory_usage_qualified(): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) @@ -493,6 +496,7 @@ def test_info_categorical(): df.info(buf=buf) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") def test_info_int_columns(): # GH#37245 @@ -516,6 +520,7 @@ def test_info_int_columns(): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0f27eae1a3bfc..ec070467b242e 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -352,6 +354,7 @@ def test_quantile_multi_empty(self, interp_method): ) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_datetime(self, unit): dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) df = DataFrame({"a": dti, "b": [0, 5]}) @@ -405,6 +408,7 @@ def test_quantile_datetime(self, unit): expected = DataFrame(index=[0.5], columns=[]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [ @@ -675,6 +679,7 @@ def test_quantile_nat(self, interp_method, request, using_array_manager, unit): ) tm.assert_frame_equal(res, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_empty_no_rows_floats(self, interp_method): interpolation, method = interp_method @@ -913,6 +918,7 @@ def test_quantile_ea_scalar(self, request, obj, index): else: tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis", [ @@ -931,6 +937,7 @@ def test_empty_numeric(self, dtype, expected_data, expected_index, axis): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis, expected_dtype", [ @@ -949,6 +956,7 @@ def test_empty_datelike( ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "expected_data, expected_index, axis", [ diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 0884c091ba96a..fd8039975a514 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -624,6 +624,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( @@ -1443,6 +1444,7 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1538,6 +1540,7 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index fbf36dbc4fb02..44d7bbf57fe0a 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -644,6 +646,7 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "array, dtype", [ diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 250567eafc670..bed8b030bc72a 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError import pandas as pd @@ -420,6 +422,7 @@ def test_to_csv_empty(self): result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_column_type=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 @@ -432,6 +435,7 @@ def test_to_csv_chunksize(self): result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] @@ -528,6 +532,7 @@ def test_to_csv_headers(self): assert return_value is None tm.assert_frame_equal(to_df, recons) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_multiindex(self, float_frame, datetime_frame): frame = float_frame old_index = frame.index @@ -721,6 +726,7 @@ def test_to_csv_withcommas(self): df2 = self.read_csv(path) tm.assert_frame_equal(df2, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_mixed(self): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -810,6 +816,7 @@ def test_to_csv_dups_cols(self): result.columns = df.columns tm.assert_frame_equal(result, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_dups_cols2(self): # GH3457 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index f64cfd5fe6a2d..42858aa412810 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -35,6 +37,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): assert _last_df is not None and not _last_df[column].equals(df[column]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index f68785a354d7e..4e32dc0cb3f98 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -1560,6 +1562,7 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index 098d1829b973c..dc163268f64b9 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -2,6 +2,8 @@ import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -9,6 +11,7 @@ pa = pytest.importorskip("pyarrow") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="14.0") def test_dataframe_arrow_interface(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) @@ -31,6 +34,7 @@ def test_dataframe_arrow_interface(): assert table.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="15.0") def test_dataframe_to_arrow(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 712494ef15f97..9bd61736624ca 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -183,6 +185,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types @@ -214,6 +217,7 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert @@ -434,6 +438,7 @@ def test_update_inplace_sets_valid_block_values(using_copy_on_write): assert df.isnull().sum().sum() == 0 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b45eca127b3e4..aab900f6eef47 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1963,6 +1963,7 @@ def test_constructor_with_datetimes4(self): df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_constructor_with_datetimes5(self): # GH 7822 # preserver an index with a tz on dict construction diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 2c807c72582c5..13232a0909c5b 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -726,6 +728,7 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 45d06c56d353f..db15461ba0234 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -608,6 +608,7 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, expected", [ @@ -1068,6 +1069,7 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): @@ -1117,6 +1119,7 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): @@ -1359,6 +1362,7 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index d8b92091260a3..75ef348b75deb 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.errors import PerformanceWarning @@ -1646,6 +1648,7 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1889,6 +1892,7 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 850c92013694f..6f7453d0d1655 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -126,6 +128,7 @@ def test_pos_object(self, df): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "df", [ diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6223a153df358..b267347aaf030 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -333,6 +335,7 @@ def aggfun_1(ser): assert len(result) == 0 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1098,6 +1101,7 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5c99882cef6d2..fbbace54a3444 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -93,6 +95,7 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -161,6 +164,7 @@ def test_cython_agg_return_dict(): tm.assert_series_equal(ts, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 00136e572288e..35ee6c388d5a8 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError import pandas as pd @@ -306,6 +308,7 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index a2440e09dfc02..34b046bff7c91 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -71,6 +73,7 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -272,6 +275,7 @@ def test_describe(self, df, gb, gni): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [int, float, object]) @pytest.mark.parametrize( "kwargs", diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index a8ed9e9d52021..344258257ba80 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -704,6 +706,7 @@ def test_first_multi_key_groupby_categorical(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 361a8c27fbf9d..d3bc815402ade 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -168,6 +170,7 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) @@ -250,6 +253,7 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 93a4e743d0d71..5b4c08fc24411 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -108,6 +110,7 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8e25177368d8b..dcc0b39f0006c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -285,6 +287,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("groupby", ["column", "array", "function"]) @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize( @@ -372,6 +375,7 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f60ff65536f20..1073dda954563 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -338,6 +340,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 44d6340e55507..4381b36b0b73a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -1597,6 +1599,7 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -2691,6 +2694,7 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, numeric_only): # GH 15106 & GH 41998 @@ -2707,6 +2711,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2864,6 +2869,7 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_none_column_name(): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9155f2cccf117..9c01e017dd29c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -97,6 +99,7 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index d763b67059375..6e3ae2f7d8fae 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( CategoricalIndex, @@ -842,6 +844,7 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -986,6 +989,7 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 7d5c1625b8ab4..1044c83e3e56b 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,4 +1,7 @@ import numpy as np +import pytest + +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -8,6 +11,7 @@ import pandas._testing as tm +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 0b451ce73db89..64780d0ba03d8 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -117,6 +119,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): gb.transform(groupby_func, *args) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( how, by, groupby_series, groupby_func, df_with_string_col @@ -216,6 +219,7 @@ def func(x): getattr(gb, how)(func) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 25b0f80639cff..f67051de6e8c7 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -455,6 +457,7 @@ def test_max_min_non_numeric(): assert "ss" in result +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_max_min_object_multiple_columns(using_array_manager): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 8ef7c2b8ce859..69542b934e65f 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -10,6 +10,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -73,6 +75,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 # TimeGrouper requires a sorted index diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index fd9bd5cc55538..a5433d5496b0b 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -497,6 +499,7 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_transform_nuisance_raises(df): # case that goes through _transform_item_by_item @@ -579,6 +582,7 @@ def test_transform_coercion(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_with_int(): # GH 3740, make sure that we might upcast on item-by-item transform @@ -846,6 +850,7 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "op, args, targop", @@ -1219,6 +1224,7 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 3ef3f3ad4d3a2..2176aa52b17f4 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Index, @@ -231,6 +233,7 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index bfbb0c3dda5c5..0e6722ff50d1e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -846,6 +846,7 @@ def test_append_preserves_dtype(self, simple_index): alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_inv(self, simple_index, using_infer_string): idx = simple_index diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 43dd3812e8b7d..3fd9498e21a73 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1216,6 +1218,7 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iloc_setitem_multicolumn_to_datetime(self): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 162c2c58ff744..d2c8454019a5e 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -561,6 +563,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d33719f3e2115..41431c0e2813b 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -642,6 +642,7 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_consistency_slice_column_len(self): # .loc[:,column] setting with slice == len of the column # GH10408 diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 25418b8bb2b37..d1a15dc93f702 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -418,6 +420,7 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") @@ -434,6 +437,7 @@ def test_non_str_names(): assert names == ["0"] +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8d36dc7520019..a83176cfe28f8 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -595,6 +595,7 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(self, read_ext, dtype_backend, engine): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 292eab2d88152..7ecddb18a61ec 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency @@ -1311,6 +1313,7 @@ def test_freeze_panes(self, path): result = pd.read_excel(path, index_col=0) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_path_lib(self, engine, ext): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 7f1443c3ee66b..b29c880d1f823 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, MultiIndex, @@ -729,6 +731,7 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize( "columns, siunitx", diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 2157498aea95e..99c45c61fc8a4 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -208,6 +208,7 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -285,6 +286,7 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -712,6 +714,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -816,6 +819,7 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -828,6 +832,7 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -898,6 +903,7 @@ def test_convert_dates_infer(self, infer_word): result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -958,6 +964,7 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_date_unit(self, unit, datetime_frame): df = datetime_frame @@ -1063,6 +1070,7 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan, downcast=False) tm.assert_frame_equal(res, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -2027,6 +2035,7 @@ def test_json_uint64(self): result = df.to_json(orient="split") assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 9f42cf674b0a7..7b70601addcad 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -228,6 +230,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 7ffc49e941c14..95a7664304889 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( EmptyDataError, ParserError, @@ -915,6 +917,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index a7a8d031da215..1f5021c8a24cc 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( EmptyDataError, ParserError, @@ -67,6 +69,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 038c684c90c9e..0121af53f1aa4 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -8,6 +8,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -85,6 +87,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ce02e752fb90b..9c1ae6f9b4236 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserWarning import pandas as pd @@ -54,6 +56,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -301,6 +304,7 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -316,6 +320,7 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -458,6 +463,7 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -501,6 +507,7 @@ def test_dtype_backend_ea_dtype_specified(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 27d7bc0bb6c07..1501479510e17 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,6 +17,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -183,6 +185,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtypes(c_parser_only): parser = c_parser_only data = """\ diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 7f3e45324dbd2..a3c6dc8fd0898 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -184,6 +186,7 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 1d245f81f027c..32a8d3b81f470 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -7,6 +7,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas import DataFrame import pandas._testing as tm @@ -118,6 +120,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index ca106fa772e82..1a3b7b37bf66b 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -258,6 +260,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -432,6 +435,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", @@ -626,6 +630,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -677,6 +682,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 623657b412682..be2015fca27d1 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -16,6 +16,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + from pandas._libs.tslibs import parsing import pandas as pd @@ -1797,6 +1799,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -2051,6 +2054,7 @@ def test_parse_dates_and_keep_original_column(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -2200,6 +2204,7 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index dbd474c6ae0b9..9e7530906afa3 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -17,6 +17,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( ParserError, ParserWarning, @@ -496,6 +498,7 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -523,6 +526,7 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index bed2b5e10a6f7..53426bebaa70b 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import EmptyDataError import pandas as pd @@ -966,6 +968,7 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(string_storage, dtype_backend): # GH#50289 if string_storage == "python": diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index bc4c4c2e24e9c..d8c40670afcbd 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -84,6 +86,7 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 00a81a4f1f385..93e50455fe6a2 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp import pandas.util._test_decorators as td @@ -23,7 +25,10 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 58ebdfe7696b4..07c797467e5e2 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -14,7 +16,10 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_categorical(setup_path): diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index c5cac5a5caf09..d140cfc941e16 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -11,6 +13,10 @@ from pandas.io.pytables import read_hdf +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def test_complex_fixed(tmp_path, setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 2021101098892..c31b9989ef35e 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( CategoricalIndex, DataFrame, @@ -22,7 +24,10 @@ _maybe_adjust_name, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_pass_spec_to_storer(setup_path): diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index d93de16816725..9daf2a5910a08 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( PY311, is_ci_environment, @@ -32,7 +34,10 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index bc5f046b7fa33..f84a3ebfeb54a 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp import pandas as pd @@ -22,7 +24,10 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_format_type(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e4a3ea1fc9db8..a04f02f0e052b 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -26,7 +28,10 @@ from pandas.io.pytables import TableIterator -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_read_missing_key_close_store(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 4ba9787a5a6b9..2397d18b1019e 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -24,7 +26,10 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_conv_read_write(): diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 0e303d1c890c5..9f403f8293aed 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp import pandas as pd @@ -24,7 +26,10 @@ from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_select_columns_in_where(setup_path): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 82d3052e7f5d6..8a33cccf62fcf 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -31,7 +33,10 @@ read_hdf, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index c5613daf62207..05d630dc0e47c 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -23,6 +25,10 @@ ensure_clean_store, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b71896c77ffb5..493971f9f56ef 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import IS64 from pandas.errors import EmptyDataError import pandas.util._test_decorators as td @@ -16,6 +18,10 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture def dirpath(datapath): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3c0208fcc74ec..da998f058471c 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -28,6 +30,10 @@ init_qt_clipboard, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def build_kwargs(sep, excel): kwargs = {} diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e51f86563081b..56707560c2fda 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -152,6 +154,7 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -347,6 +350,7 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in ["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -450,6 +454,7 @@ def test_unknown_engine(self): with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. @@ -508,6 +513,7 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): @@ -528,6 +534,7 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3a58dda9e8dc4..25504c7b88fdb 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas as pd @@ -137,6 +139,7 @@ def test_compression_warning(compression_only): df.to_csv(handles.handle, compression=compression_only) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 22a7d3b83a459..bd7f45396d38a 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import ( @@ -149,6 +151,7 @@ def test_path_localpath(self): result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -167,6 +170,7 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_feather_dtype_backend(self, string_storage, dtype_backend): # GH#50765 df = pd.DataFrame( diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index a1dec8a2d05b4..7b78cf63f4167 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, date_range, @@ -195,6 +197,7 @@ def test_arrowparquet_options(fsspectest): @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -252,6 +255,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.single_cpu @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_s3_parquet(s3_public_bucket, s3so, df1): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4b337b5b82052..a7ae9c7049702 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -145,6 +147,7 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 607357e709b6e..298b9115b51e4 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,6 +13,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -36,6 +38,10 @@ from pandas.io.common import file_path_to_url +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture( params=[ diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index 2ca11ad1f74e6..26e1412466e7b 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -7,6 +7,8 @@ import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -83,6 +85,7 @@ def stata_responder(df): return bio.getvalue() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "responder, read_method", [ diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a4021311fc963..52d6850483418 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -17,9 +19,12 @@ import pyarrow as pa -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.fixture diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 760a64c8d4c33..72efe989804e4 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + using_string_dtype, +) from pandas._config.config import _get_option from pandas.compat import is_platform_windows @@ -52,6 +55,7 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7068247bbfa8b..792c532fa8032 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.compat import ( pa_version_under13p0, @@ -61,9 +63,12 @@ import sqlalchemy -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.fixture diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6bd74faa8a3db..3c5e843e2e97b 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -345,6 +347,7 @@ def test_write_dta6(self, datapath): check_index_type=False, ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta10(self, version): original = DataFrame( @@ -1150,6 +1153,7 @@ def test_categorical_ordering(self, file, datapath): assert parsed[col].cat.ordered assert not parsed_unordered[col].cat.ordered + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1240,6 +1244,7 @@ def test_iterator(self, datapath): from_chunks = pd.concat(itr) tm.assert_frame_equal(parsed, from_chunks) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1543,6 +1548,7 @@ def test_inf(self, infval): with tm.ensure_clean() as path: df.to_stata(path) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_pathlib(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -1578,6 +1584,7 @@ def test_value_labels_iterator(self, write_index): value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}} + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_set_index(self): # GH 17328 df = DataFrame( @@ -1611,6 +1618,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_writer_117(self): original = DataFrame( data=[ @@ -1717,6 +1725,7 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version): # GH 21041 @@ -1735,6 +1744,7 @@ def test_nonfile_writing(self, version): reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_gzip_writing(self): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( @@ -1767,6 +1777,7 @@ def test_unicode_dta_118(self, datapath): tm.assert_frame_equal(unicode_df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mixed_string_strl(self): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] @@ -1864,6 +1875,7 @@ def test_stata_119(self, datapath): reader._ensure_open() assert reader._nvar == 32999 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [118, 119, None]) def test_utf8_writer(self, version): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) @@ -2131,6 +2143,7 @@ def test_iterator_errors(datapath, chunksize): pass +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iterator_value_labels(): # GH 31544 values = ["c_label", "b_label"] + ["a_label"] * 500 diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 900734e9f0fdf..35beda37acf51 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -2005,6 +2007,7 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): tm.assert_frame_equal(df_lxml, df_etree) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_xml_nullable_dtypes( parser, string_storage, dtype_backend, using_infer_string ): diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index a85576ff13f5c..b2d96cb1d9133 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -4,6 +4,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserWarning import pandas.util._test_decorators as td @@ -83,6 +85,7 @@ def read_xml_iterparse(data, **kwargs): # DTYPE +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_single_str(parser): df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( @@ -208,6 +211,7 @@ def test_wrong_dtype(xml_books, parser, iterparse): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_both_dtype_converters(parser): df_expected = DataFrame( { diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 30ec0d0affaa3..fb457f20f7a48 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -1441,6 +1443,7 @@ def test_mode_numerical_nan(self, dropna, expected): expected = Series(expected) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dropna, expected1, expected2, expected3", [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], @@ -1468,6 +1471,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): expected3 = Series(expected3) tm.assert_series_equal(result, expected3) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dropna, expected1, expected2", [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])], @@ -1605,6 +1609,7 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 550523a432a89..32567b4300152 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas as pd @@ -492,6 +494,7 @@ def test_empty(keys): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 9e34d02091e69..2a52d3060e4b9 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -44,6 +46,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index a2e22ea73fd86..0865e3cfa8149 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -4,6 +4,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -3081,6 +3083,7 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_merge_datatype_error_raises(self, using_infer_string): if using_infer_string: msg = "incompatible merge keys" diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index f9a03222c8057..b12438b6327ad 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -361,6 +363,7 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 272c5b3403293..cbe2c9b931ee3 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -81,6 +83,7 @@ def test_default_col_names(self, df): result2 = df.melt(id_vars=["id1", "id2"]) assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -97,6 +100,7 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -177,6 +181,7 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -204,6 +209,7 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -233,6 +239,7 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -357,6 +364,7 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): multi.melt(["A"], ["F"], col_level=0) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -1197,6 +1205,7 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", ["O", "string"]) def test_missing_stubname(self, dtype): # GH46044 @@ -1222,6 +1231,7 @@ def test_missing_stubname(self, dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7b27d19483bd2..9aa13d59a586b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1081,6 +1081,7 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -2524,6 +2525,7 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [object, "string"]) def test_pivot_integer_bug(self, dtype): df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 8d78d34e936f0..1d5d16f39e648 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.concat import union_categoricals import pandas as pd @@ -122,6 +124,7 @@ def test_union_categoricals_nan(self): exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [[], ["1"]]) def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 34465a7c12c18..f0803ac2f2a30 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -11,6 +11,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + from pandas._libs.tslibs.timezones import maybe_get_tz from pandas.errors import SettingWithCopyError @@ -526,6 +528,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime(self): # GH 10086 ser = Series(date_range("20130101", periods=5)) @@ -568,6 +571,7 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_days(self): ser = Series(date_range("20130101", periods=5)) ser.iloc[0] = pd.NaT @@ -598,6 +602,7 @@ def test_strftime_period_days(self, using_infer_string): expected = expected.astype("string[pyarrow_numpy]") tm.assert_index_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_microsecond_resolution(self): ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") @@ -630,6 +635,7 @@ def test_strftime_period_minutes(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data", [ diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index f4992b758af74..a26e541732d36 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError from pandas import ( @@ -268,6 +270,7 @@ def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_w assert (string_series[10:20] == 0).all() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index ed681563f6fcd..fb8e5c31929b2 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import ( np_version_gt2, np_version_gte1p24, @@ -561,6 +563,7 @@ def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): tm.assert_series_equal(ser, expected) assert isinstance(ser["td"], Timedelta) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_with_expansion_type_promotion(self): # GH#12599 ser = Series(dtype=object) @@ -570,6 +573,7 @@ def test_setitem_with_expansion_type_promotion(self): expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) tm.assert_series_equal(ser, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_not_contained(self, string_series): # set item that's not contained ser = string_series.copy() @@ -873,6 +877,7 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -885,6 +890,7 @@ def test_index_where(self, obj, key, expected, warn, val, using_infer_string): expected_idx = Index(expected, dtype=expected.dtype) tm.assert_index_equal(res, expected_idx) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 29dd704f6efa9..8fac40fe5fb25 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PYPY from pandas import ( @@ -140,6 +142,7 @@ def test_info_memory_usage_deep_pypy(): assert s_object.memory_usage(deep=True) == s_object.memory_usage() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, plus", [ diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 850740fac907d..c6727e023e786 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -760,6 +760,7 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 1c17013d621c7..999dd90d337d9 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import Series import pandas._testing as tm @@ -24,6 +26,7 @@ def read_csv(self, path, **kwargs): return out + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_csv(self, datetime_series, string_series): # freq doesn't round-trip datetime_series.index = datetime_series.index._with_freq(None) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 3c70e839c8e20..8569e0f49716a 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -134,6 +136,7 @@ def test_unstack_mixed_type_name_in_multiindex( tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_unstack_multi_index_categorical_values(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index b40e2e99dae2e..1ffc9ddca5adf 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency @@ -499,6 +501,7 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_comparisons(self, using_infer_string): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index d9c94e871bd4b..197ef47759bf3 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -360,6 +362,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 718d1b3ee2e83..f3a7ba2607f4a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import ( algos as libalgos, hashtable as ht, @@ -1701,6 +1703,7 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ @@ -1740,6 +1743,7 @@ def test_hashtable_unique(self, htable, data, writable): reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ From 6882ef90d2b5f80173ede725fd89512601d9b31b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jul 2024 19:07:20 +0200 Subject: [PATCH 07/52] TST (string dtype): follow-up on GH-59329 fixing new xfails (#59352) * TST (string dtype): follow-up on GH-59329 fixing new xfails * add missing strict --- pandas/_testing/asserters.py | 10 ++++++++-- .../tests/arrays/interval/test_interval_pyarrow.py | 3 +++ pandas/tests/arrays/masked/test_arrow_compat.py | 12 +++++++++--- pandas/tests/arrays/masked/test_function.py | 3 --- pandas/tests/arrays/period/test_arrow_compat.py | 4 ++++ pandas/tests/arrays/string_/test_string.py | 4 ++++ pandas/tests/arrays/test_array.py | 3 +++ pandas/tests/dtypes/test_common.py | 3 +++ pandas/tests/frame/methods/test_astype.py | 3 +++ pandas/tests/frame/test_arithmetic.py | 1 + pandas/tests/groupby/test_apply.py | 3 +++ pandas/tests/indexes/base_class/test_formats.py | 1 + pandas/tests/indexes/multi/test_setops.py | 3 +++ pandas/tests/indexes/test_old_base.py | 1 + pandas/tests/io/excel/test_readers.py | 4 +++- pandas/tests/io/json/test_json_table_schema.py | 6 ++++++ pandas/tests/io/json/test_pandas.py | 2 ++ pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- pandas/tests/io/parser/test_upcast.py | 2 +- pandas/tests/io/parser/usecols/test_usecols_basic.py | 3 +++ pandas/tests/io/test_feather.py | 10 ++++++---- pandas/tests/io/test_fsspec.py | 1 + pandas/tests/reshape/test_get_dummies.py | 3 +++ pandas/tests/series/methods/test_convert_dtypes.py | 3 +++ pandas/tests/test_downstream.py | 3 +++ 25 files changed, 78 insertions(+), 15 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 41d2a7344a4ed..e79e353137152 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -593,13 +593,19 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + # TODO(infer_string) this special case could be avoided if we have + # a more informative repr https://github.com/pandas-dev/pandas/issues/59342 + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index ef8701be81e2b..be87d5d3ef7ba 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -80,6 +82,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 7a89656bd5aa0..31765165f5f16 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,12 +1,18 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] + pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 81338bca460a6..b259018cd6121 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -60,7 +58,6 @@ def test_tolist(data): tm.assert_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_numpy(): # GH#56991 diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 431309aca0df2..ff86b696c8403 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,5 +1,7 @@ import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -77,6 +79,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -96,6 +99,7 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3ef7862d739cd..2663f3d7c0595 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 from pandas.core.dtypes.common import is_dtype_equal @@ -513,6 +515,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -541,6 +544,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert result.loc[2, "a"] is result["a"].dtype.na_value +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks( dtype, string_storage2, request, using_infer_string diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 96263f498935b..76b42b643ee69 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -6,6 +6,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import register_extension_dtype @@ -275,6 +277,7 @@ def test_array_copy(): cet = pytz.timezone("CET") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..982156d7a9b1d 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -128,6 +130,7 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) def test_pyarrow_string_import_error(name, dtype): # GH-44276 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 5a1e3cd786f84..b6510f384fabe 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -757,6 +759,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 4e32dc0cb3f98..0407388d61f51 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2124,6 +2124,7 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mixed_col_index_dtype(): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0ddacfab8c102..4972a6b3afa17 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -940,6 +942,7 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 955e3be107f75..b2f345e5e6f77 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -9,6 +9,7 @@ class TestIndexRendering: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 0abb56ecf9de7..31a5d2fb906eb 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( CategoricalIndex, @@ -758,6 +760,7 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): tm.assert_index_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_union_with_na_when_constructing_dataframe(): # GH43222 series1 = Series( diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 0e6722ff50d1e..a2256322d968b 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -232,6 +232,7 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a83176cfe28f8..1b79b4bff1cea 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -660,7 +660,9 @@ def test_dtype_backend_and_dtype(self, read_ext): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="infer_string takes precedence") + @pytest.mark.xfail( + using_string_dtype(), reason="infer_string takes precedence", strict=False + ) def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index cc101bb9c8b6d..53e819ac5eaff 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -24,6 +26,10 @@ set_default_names, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture def df_schema(): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 99c45c61fc8a4..a1d2e93e7c523 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1479,6 +1479,7 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) @@ -1491,6 +1492,7 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 9c1ae6f9b4236..da17999bba4ca 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -463,7 +463,7 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index d8c40670afcbd..01e576ba40f26 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -86,7 +86,7 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 767fba666e417..24937de163662 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError from pandas import ( @@ -545,6 +547,7 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index bd7f45396d38a..d169fab3f1832 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -13,9 +13,12 @@ from pandas.io.feather_format import read_feather, to_feather # isort:skip -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] pa = pytest.importorskip("pyarrow") @@ -170,7 +173,6 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_feather_dtype_backend(self, string_storage, dtype_backend): # GH#50765 df = pd.DataFrame( diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 7b78cf63f4167..65f4156cedf49 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -277,6 +277,7 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 31260e4dcb7d2..3d9b3a6d1c7a2 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -214,6 +216,7 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_dummies_string_dtype(self, df, using_infer_string): # GH44965 df = df[["A", "B"]] diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index b0a920ba02cad..46fed9032c13d 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib import pandas as pd @@ -181,6 +183,7 @@ def test_cases(request): class TestSeriesConvertDtypes: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) def test_convert_dtypes( self, diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 51ce73ef54300..bf88da04b73ff 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -165,6 +167,7 @@ def test_pandas_datareader(): pytest.importorskip("pandas_datareader") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") From 99ebd18aea672a01a07089f187089e7bf40a9a97 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 19:24:33 +0200 Subject: [PATCH 08/52] TST (string dtype): change any_string_dtype fixture to use actual dtype instances (#59345) * TST (string dtype): change any_string_dtype fixture to use actual dtype instances * avoid pyarrow import error during test collection * fix dtype equality in case pyarrow is not installed * keep using mode.string_storage as default for NA variant + more xfails * fix test_series_string_inference_storage_definition * remove no longer necessary xfails --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/conftest.py | 29 +++++--- pandas/core/arrays/string_.py | 6 +- .../arrays/categorical/test_constructors.py | 1 - pandas/tests/copy_view/test_astype.py | 2 - pandas/tests/dtypes/test_common.py | 3 - pandas/tests/io/parser/test_index_col.py | 3 + pandas/tests/series/test_constructors.py | 2 +- pandas/tests/strings/__init__.py | 10 ++- pandas/tests/strings/test_find_replace.py | 70 ++++++++++++++----- pandas/tests/strings/test_split_partition.py | 4 +- pandas/tests/strings/test_strings.py | 32 ++++++--- 11 files changed, 115 insertions(+), 47 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 10134c90f8eeb..a502152780a27 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1306,20 +1306,33 @@ def object_dtype(request): @pytest.fixture( params=[ - "object", - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ] + np.dtype("object"), + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ], + ids=[ + "string=object", + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + ], ) def any_string_dtype(request): """ Parametrized fixture for string dtypes. * 'object' - * 'string[python]' - * 'string[pyarrow]' + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) """ - return request.param + if isinstance(request.param, np.dtype): + return request.param + else: + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) @pytest.fixture(params=tm.DATETIME64_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c40f5b8f58d9e..c4f208027c9da 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -124,7 +124,7 @@ def __init__( ) -> None: # infer defaults if storage is None: - if using_string_dtype(): + if using_string_dtype() and na_value is not libmissing.NA: storage = "pyarrow" else: storage = get_option("mode.string_storage") @@ -162,7 +162,9 @@ def __eq__(self, other: object) -> bool: return True try: other = self.construct_from_string(other) - except TypeError: + except (TypeError, ImportError): + # TypeError if `other` is not a valid string for StringDtype + # ImportError if pyarrow is not installed for "string[pyarrow]" return False if isinstance(other, type(self)): return self.storage == other.storage and self.na_value is other.na_value diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 60d78a906b528..6813683cb5219 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -743,7 +743,6 @@ def test_interval(self): tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: arr = pd.arrays.StringArray._from_sequence( diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 14fd8fb5f911e..514ee6410ecf1 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -98,7 +98,6 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -116,7 +115,6 @@ def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 982156d7a9b1d..c34c97b6e4f04 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -130,7 +128,6 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) def test_pyarrow_string_import_error(name, dtype): # GH-44276 diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index ba15d061b2deb..6dbfed2b6ae83 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -342,6 +344,7 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 391c9361080d8..0c39cead78baf 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2138,7 +2138,7 @@ def test_series_string_inference_storage_definition(self): # returning the NA string dtype, so expected is changed from # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow]") + expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index e94f656fc9823..6c4bec6a23789 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,7 +2,15 @@ import pandas as pd -object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + +def is_object_or_nan_string_dtype(dtype): + """ + Check if string-like dtype is following NaN semantics, i.e. is object + dtype or a NaN-variant of the StringDtype. + """ + return (isinstance(dtype, np.dtype) and dtype == "object") or ( + dtype.na_value is np.nan + ) def _convert_na_value(ser, expected): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index cd4707ac405de..df490297e2a5c 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) # -------------------------------------------------------------------------------------- @@ -34,7 +34,9 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, @@ -53,7 +55,9 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -80,14 +84,18 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -172,7 +180,9 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, @@ -213,7 +223,9 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -231,7 +243,9 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -641,7 +655,9 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -696,12 +712,16 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -709,7 +729,9 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -725,7 +747,9 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -734,7 +758,9 @@ def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -744,14 +770,18 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) @@ -823,7 +853,9 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -875,7 +907,9 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 9ff1fc0e13ae9..423993e881b98 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) @@ -384,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index f662dfd7e2b14..015df18221b40 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,7 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import is_object_or_nan_string_dtype @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -41,7 +41,9 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -93,7 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -207,7 +209,9 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -232,7 +236,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ser = Series( ["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -252,7 +258,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -283,7 +291,9 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -312,7 +322,9 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -353,7 +365,9 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) From 15660424000fb288c923fec8a2e0f8f4750ffaf4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 20:25:49 +0200 Subject: [PATCH 09/52] TST (string dtype): remove usage of arrow_string_storage fixture (#59368) * TST (string dtype): remove usage of arrow_string_storage fixture * fixup --- pandas/tests/arrays/string_/test_string.py | 16 ++++++++-------- pandas/tests/arrays/string_/test_string_arrow.py | 12 ++++++------ pandas/tests/extension/test_string.py | 12 ++++++------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2663f3d7c0595..1c55c1d8f3e2e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -165,8 +165,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage: +def test_add_2d(dtype, request): + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.applymarker(mark) @@ -464,8 +464,8 @@ def test_min_max(method, skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage and box is pd.array: +def test_min_max_numpy(method, box, dtype, request): + if dtype.storage == "pyarrow" and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -479,7 +479,7 @@ def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): assert result == expected -def test_fillna_args(dtype, arrow_string_storage): +def test_fillna_args(dtype): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -492,7 +492,7 @@ def test_fillna_args(dtype, arrow_string_storage): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." @@ -643,10 +643,10 @@ def test_value_counts_sort_false(dtype): tm.assert_series_equal(result, expected) -def test_memory_usage(dtype, arrow_string_storage): +def test_memory_usage(dtype): # GH 33963 - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 06013c3d11664..dceb93364d505 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -48,18 +48,18 @@ def test_config_bad_storage_raises(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("array", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): +@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"]) +def test_constructor_not_string_type_raises(array_lib, chunked): pa = pytest.importorskip("pyarrow") - array = pa if array in arrow_string_storage else np + array_lib = pa if array_lib == "pyarrow" else np - arr = array.array([1, 2, 3]) + arr = array_lib.array([1, 2, 3]) if chunked: - if array is np: + if array_lib is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if array is np: + if array_lib is np: msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5a72b2244d2bf..895640d9fbeaa 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -115,8 +115,8 @@ def test_is_not_string_type(self, dtype): # because StringDtype is a string type assert is_string_dtype(dtype) - def test_view(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_view(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -124,13 +124,13 @@ def test_from_dtype(self, data): # base test uses string representation of dtype pass - def test_transpose(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_transpose(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) - def test_setitem_preserves_views(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_setitem_preserves_views(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) From 1d77d0eee5479397d2f96aad3310b626723bc505 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 22:44:18 +0200 Subject: [PATCH 10/52] TST (string dtype): replace string_storage fixture with explicit storage/na_value keyword arguments for dtype creation (#59375) --- pandas/conftest.py | 18 ++++++++++++++++++ pandas/tests/arrays/string_/test_string.py | 7 ++++--- pandas/tests/extension/test_string.py | 5 +++-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index a502152780a27..39d4e25e4cbfd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1262,6 +1262,24 @@ def string_storage(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ] +) +def string_dtype_arguments(request): + """ + Parametrized fixture for StringDtype storage and na_value. + + * 'python' + pd.NA + * 'pyarrow' + pd.NA + * 'pyarrow' + np.nan + """ + return request.param + + @pytest.fixture( params=[ "numpy_nullable", diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1c55c1d8f3e2e..b1c5f4338a4ed 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -22,9 +22,10 @@ @pytest.fixture -def dtype(string_storage): - """Fixture giving StringDtype from parametrized 'string_storage'""" - return pd.StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + """Fixture giving StringDtype from parametrized storage and na_value arguments""" + storage, na_value = string_dtype_arguments + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 895640d9fbeaa..1102d9d941663 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -58,8 +58,9 @@ def chunked(request): @pytest.fixture -def dtype(string_storage): - return StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + storage, na_value = string_dtype_arguments + return StringDtype(storage=storage, na_value=na_value) @pytest.fixture From 2465a6d40b582f6a61ba48371bd0d1e7823d9f6b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Aug 2024 17:55:37 +0200 Subject: [PATCH 11/52] String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) (#59376) * String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) * add type annotation --- pandas/conftest.py | 2 -- pandas/core/config_init.py | 22 +++++++++++-- pandas/tests/arrays/string_/test_string.py | 32 +++++-------------- .../tests/arrays/string_/test_string_arrow.py | 10 +++--- pandas/tests/frame/methods/test_astype.py | 9 ++++++ .../frame/methods/test_convert_dtypes.py | 5 ++- pandas/tests/io/conftest.py | 16 ---------- 7 files changed, 45 insertions(+), 51 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 39d4e25e4cbfd..78cdc2ac5a2bb 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1248,7 +1248,6 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1257,7 +1256,6 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'pyarrow_numpy' """ return request.param diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a6625d99eaa71..4cd7e50f0ec50 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -12,7 +12,10 @@ from __future__ import annotations import os -from typing import Callable +from typing import ( + Any, + Callable, +) import pandas._config.config as cf from pandas._config.config import ( @@ -506,12 +509,27 @@ def use_inf_as_na_cb(key) -> None: ``future.infer_string`` is set to True. """ + +def is_valid_string_storage(value: Any) -> None: + legal_values = ["python", "pyarrow"] + if value not in legal_values: + msg = "Value must be one of python|pyarrow" + if value == "pyarrow_numpy": + # TODO: we can remove extra message after 3.0 + msg += ( + ". 'pyarrow_numpy' was specified, but this option should be " + "enabled using pandas.options.future.infer_string instead" + ) + raise ValueError(msg) + + with cf.config_prefix("mode"): cf.register_option( "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + # validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_valid_string_storage, ) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b1c5f4338a4ed..6bf20b6fcd5f7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -516,19 +516,12 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): +def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -536,30 +529,21 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks( - dtype, string_storage2, request, using_infer_string -): +def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -569,10 +553,10 @@ def test_arrow_load_from_zero_chunks( assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index dceb93364d505..d38b728aaf120 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -27,16 +27,18 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) - if string_storage == "pyarrow_numpy": + if using_infer_string and string_storage == "python": + # python string storage with na_value=NaN is not yet implemented request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype(string_storage) + dtype = StringDtype( + string_storage, na_value=np.nan if using_infer_string else pd.NA + ) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b6510f384fabe..ea9cc22d93758 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -912,3 +912,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val): with option_context("mode.string_storage", string_storage): df.astype("string", copy=False) tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val): + # GH#51073 - variant of the above test with explicit dtype instances + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + df.astype(any_string_dtype) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 91fa81b5bee2e..59779234b46d9 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -10,6 +10,8 @@ class TestConvertDtypes: + # TODO convert_dtypes should not use NaN variant of string dtype, but always NA + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) @@ -18,9 +20,6 @@ def test_convert_dtypes( ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string: - string_storage = "pyarrow_numpy" - df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ab6cacc4cc860..bdefadf3dbec0 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -224,19 +224,3 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] - - -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - ] -) -def string_storage(request): - """ - Parametrized fixture for pd.options.mode.string_storage. - - * 'python' - * 'pyarrow' - """ - return request.param From 35ebe683cffdf117ee0db7b4dd8b99b04dd4b5a6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 6 Aug 2024 19:06:22 +0200 Subject: [PATCH 12/52] API/TST: expand tests for string any/all reduction + fix pyarrow-based implementation (#59414) --- pandas/tests/reductions/test_reductions.py | 51 +++++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index fb457f20f7a48..cb9fd9e8da0df 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1091,25 +1091,62 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() - def test_any_all_pyarrow_string(self): + def test_any_all_string_dtype(self, any_string_dtype): # GH#54591 - pytest.importorskip("pyarrow") - ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + if ( + isinstance(any_string_dtype, pd.StringDtype) + and any_string_dtype.na_value is pd.NA + ): + # the nullable string dtype currently still raise an error + # https://github.com/pandas-dev/pandas/issues/51939 + ser = Series(["a", "b"], dtype=any_string_dtype) + with pytest.raises(TypeError): + ser.any() + with pytest.raises(TypeError): + ser.all() + return + + ser = Series(["", "a"], dtype=any_string_dtype) assert ser.any() assert not ser.all() + assert ser.any(skipna=False) + assert not ser.all(skipna=False) - ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + ser = Series([np.nan, "a"], dtype=any_string_dtype) assert ser.any() assert ser.all() - assert not ser.all(skipna=False) + assert ser.any(skipna=False) + assert ser.all(skipna=False) # NaN is considered truthy - ser = Series([None, ""], dtype="string[pyarrow_numpy]") + ser = Series([np.nan, ""], dtype=any_string_dtype) assert not ser.any() assert not ser.all() + assert ser.any(skipna=False) # NaN is considered truthy + assert not ser.all(skipna=False) - ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser = Series(["a", "b"], dtype=any_string_dtype) assert ser.any() assert ser.all() + assert ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + assert not ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([""], dtype=any_string_dtype) + assert not ser.any() + assert not ser.all() + assert not ser.any(skipna=False) + assert not ser.all(skipna=False) + + ser = Series([np.nan], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + assert ser.any(skipna=False) # NaN is considered truthy + assert ser.all(skipna=False) # NaN is considered truthy def test_timedelta64_analytics(self): # index min/max From 463fd91aecaa5083278a78d209790fb91f2daa25 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 14 Aug 2024 12:15:25 -0400 Subject: [PATCH 13/52] String dtype: implement object-dtype based StringArray variant with NumPy semantics (#58451) Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_libs/lib.pyx | 2 +- pandas/_testing/asserters.py | 18 ++ pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 2 + pandas/conftest.py | 4 + pandas/core/arrays/string_.py | 183 +++++++++++++++++++-- pandas/core/construction.py | 4 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/io/_util.py | 4 +- pandas/io/pytables.py | 7 +- pandas/tests/arrays/string_/test_string.py | 22 ++- pandas/tests/series/test_constructors.py | 26 ++- pandas/tests/strings/test_find_replace.py | 2 +- 14 files changed, 232 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1222b33aac3c1..5d8a04664b0e4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2728,7 +2728,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e79e353137152..a1f9844669c8c 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -811,6 +811,24 @@ def assert_extension_array_equal( left_na, right_na, obj=f"{obj} NA mask", index_values=index_values ) + # Specifically for StringArrayNumpySemantics, validate here we have a valid array + if ( + isinstance(left.dtype, StringDtype) + and left.dtype.storage == "python" + and left.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + if ( + isinstance(right.dtype, StringDtype) + and right.dtype.storage == "python" + and right.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) if check_exact: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5ada6d705172f..38fb0188df5ff 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -25,6 +25,7 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, @@ -190,6 +191,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p1", "pa_version_under16p0", "pa_version_under17p0", + "HAS_PYARROW", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 457d26766520d..7fa197c4a9824 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") + HAS_PYARROW = True except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -27,3 +28,4 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True + HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 78cdc2ac5a2bb..433ea7275223d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1265,6 +1265,7 @@ def string_storage(request): ("python", pd.NA), pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), ] ) def string_dtype_arguments(request): @@ -1326,12 +1327,14 @@ def object_dtype(request): ("python", pd.NA), pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), ], ids=[ "string=object", "string=string[python]", "string=string[pyarrow]", "string=str[pyarrow]", + "string=str[python]", ], ) def any_string_dtype(request): @@ -1341,6 +1344,7 @@ def any_string_dtype(request): * 'string[python]' (NA variant) * 'string[pyarrow]' (NA variant) * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) """ if isinstance(request.param, np.dtype): return request.param diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c4f208027c9da..6a3cd50b9c288 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,12 @@ from __future__ import annotations +import operator from typing import ( TYPE_CHECKING, + Any, ClassVar, Literal, + cast, ) import numpy as np @@ -19,7 +22,10 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under10p1, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -37,7 +43,10 @@ pandas_dtype, ) -from pandas.core import ops +from pandas.core import ( + nanops, + ops, +) from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -125,7 +134,10 @@ def __init__( # infer defaults if storage is None: if using_string_dtype() and na_value is not libmissing.NA: - storage = "pyarrow" + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") @@ -243,10 +255,12 @@ def construct_array_type( # type: ignore[override] ArrowStringArrayNumpySemantics, ) - if self.storage == "python": + if self.storage == "python" and self._na_value is libmissing.NA: return StringArray elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray + elif self.storage == "python": + return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics @@ -282,7 +296,7 @@ def __from_arrow__( # convert chunk by chunk to numpy and concatenate then, to avoid # overflow for large string data when concatenating the pyarrow arrays arr = arr.to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = ensure_string_array(arr, na_value=self.na_value) results.append(arr) if len(chunks) == 0: @@ -292,11 +306,7 @@ def __from_arrow__( # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) - NDArrayBacked.__init__( - new_string_array, - arr, - StringDtype(storage="python"), - ) + NDArrayBacked.__init__(new_string_array, arr, self) return new_string_array @@ -404,6 +414,8 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" + _storage = "python" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -411,7 +423,11 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(storage=self._storage, na_value=self._na_value), + ) def _validate(self): """Validate that we only store NA or strings.""" @@ -429,20 +445,36 @@ def _validate(self): else: lib.convert_nans_to_NA(self._ndarray) + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + if isna(value): + return self.dtype.na_value + elif not isinstance(value, str): + raise TypeError( + f"Cannot set non-string value '{value}' into a string array." + ) + return value + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) assert isinstance(dtype, StringDtype) and dtype.storage == "python" + else: + if using_string_dtype(): + dtype = StringDtype(storage="python", na_value=np.nan) + else: + dtype = StringDtype(storage="python") from pandas.core.arrays.masked import BaseMaskedArray + na_value = dtype.na_value if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA + result[na_values] = na_value else: if lib.is_pyarrow_array(scalars): @@ -451,12 +483,12 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + NDArrayBacked.__init__(new_string_array, result, dtype) return new_string_array @@ -506,7 +538,7 @@ def __setitem__(self, key, value) -> None: # validate new items if scalar_value: if isna(value): - value = libmissing.NA + value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( f"Cannot set non-string value '{value}' into a StringArray." @@ -520,7 +552,7 @@ def __setitem__(self, key, value) -> None: mask = isna(value) if mask.any(): value = value.copy() - value[isna(value)] = libmissing.NA + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) @@ -633,9 +665,9 @@ def _cmp_method(self, other, op): if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") - result[mask] = libmissing.NA + result[mask] = self.dtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return self._from_backing_data(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -704,3 +736,118 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) + + +class StringArrayNumpySemantics(StringArray): + _storage = "python" + _na_value = np.nan + + def _validate(self) -> None: + """Validate that we only store NaN or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN" + ) + if self._ndarray.dtype != "object": + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) + # TODO validate or force NA/None to NaN + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype is None: + dtype = StringDtype(storage="python", na_value=np.nan) + return super()._from_sequence(scalars, dtype=dtype, copy=copy) + + def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: + # need to override NumpyExtensionArray._from_backing_data to ensure + # we always preserve the dtype + return NDArrayBacked._from_backing_data(self, arr) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + # the masked_reductions use pd.NA + if result is libmissing.NA: + return np.nan + return super()._wrap_reduction_result(axis, result) + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True) -> Series: + from pandas.core.algorithms import value_counts_internal as value_counts + + result = value_counts(self._ndarray, sort=False, dropna=dropna) + result.index = result.index.astype(self.dtype) + return result + + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 748b9e4947bec..5bccca9cfbd47 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -569,7 +569,7 @@ def sanitize_array( if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -606,7 +606,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5d4e56cd8f800..9580ab1b520e0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 144416fc11691..f3dbacc02bec9 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -376,7 +376,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 2dc3b0e6c80ef..68fcfcf65e0c2 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -31,6 +31,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), - pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.string(): pd.StringDtype(na_value=np.nan), + pa.large_string(): pd.StringDtype(na_value=np.nan), }.get diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 12bb93a63f850..2e38303caa354 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -76,6 +76,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, concat, isna, @@ -3225,7 +3226,7 @@ def read( values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - result = result.astype("string[pyarrow_numpy]") + result = result.astype(StringDtype(na_value=np.nan)) return result def write(self, obj, **kwargs) -> None: @@ -3294,7 +3295,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) if len(dfs) > 0: @@ -4685,7 +4686,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 6bf20b6fcd5f7..b51b01c2b5168 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -15,6 +15,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringArrayNumpySemantics, @@ -75,6 +76,9 @@ def test_repr(dtype): elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + elif dtype.storage == "python" and dtype.na_value is np.nan: + arr_name = "StringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -90,14 +94,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if cls is pd.arrays.StringArray: + if dtype.storage == "python": msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if dtype.storage == "python": msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -339,6 +343,8 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" + elif cls is StringArrayNumpySemantics: + msg = "StringArrayNumpySemantics requires a sequence of strings or NaN" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -348,7 +354,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -389,6 +395,8 @@ def test_from_sequence_no_mutate(copy, cls, dtype): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + elif cls is StringArrayNumpySemantics: + expected = cls(nan_arr) else: expected = cls(na_arr) @@ -671,7 +679,11 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - expected = pd.Series([True, False, True]) + if dtype.storage == "python" and dtype.na_value is np.nan: + # TODO(infer_string) we should make this consistent + expected = pd.Series([True, False, False]) + else: + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -695,7 +707,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: + if dtype.storage == "python": msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0c39cead78baf..0aaa8ddcfda0c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,6 +14,7 @@ iNaT, lib, ) +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -2094,11 +2095,10 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) expected = Series(["a", 1], dtype="object") @@ -2109,35 +2109,33 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_scalar(self): # GH#54430 - pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): # https://github.com/pandas-dev/pandas/issues/54793 # but after PDEP-14 (string dtype), it was decided to keep dtype="string" # returning the NA string dtype, so expected is changed from - # "string[pyarrow_numpy]" to "string[pyarrow]" - pytest.importorskip("pyarrow") + # "string[pyarrow_numpy]" to "string[python]" expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") @@ -2153,10 +2151,10 @@ def test_series_constructor_infer_string_scalar(self): def test_series_string_inference_na_first(self): # GH#55655 - pytest.importorskip("pyarrow") - expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) def test_inference_on_pandas_objects(self): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index df490297e2a5c..2d7c9754ee319 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -236,7 +236,7 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": + elif any_string_dtype.na_value is np.nan: expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") From 397cb09f48b78444e0bd2d2982d0b8f7d2ada526 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 14 Aug 2024 12:16:06 -0400 Subject: [PATCH 14/52] REF (string dtype): de-duplicate _str_map methods (#59443) * REF: de-duplicate _str_map methods * mypy fixup --- pandas/core/arrays/string_.py | 138 ++++++++++++++++------------- pandas/core/arrays/string_arrow.py | 117 ++++++++++-------------- 2 files changed, 124 insertions(+), 131 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6a3cd50b9c288..3cbacec9d411d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -315,6 +315,8 @@ class BaseStringArray(ExtensionArray): Mixin class for StringArray, ArrowStringArray. """ + dtype: StringDtype + @doc(ExtensionArray.tolist) def tolist(self): if self.ndim > 1: @@ -328,6 +330,37 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _str_map_str_or_object( + self, + dtype, + na_value, + arr: np.ndarray, + f, + mask: npt.NDArray[np.bool_], + convert: bool, + ): + # _str_map helper for case where dtype is either string dtype or object + if is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + if self.dtype.storage == "pyarrow": + import pyarrow as pa + + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) + # error: Too many arguments for "BaseStringArray" + return type(self)(result) # type: ignore[call-arg] + + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -682,9 +715,53 @@ def _cmp_method(self, other, op): # base class "NumpyExtensionArray" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] + def _str_map_nan_semantics( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics( + f, na_value=na_value, dtype=dtype, convert=convert + ) + from pandas.arrays import BooleanArray if dtype is None: @@ -724,18 +801,8 @@ def _str_map( return constructor(result, mask) - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) class StringArrayNumpySemantics(StringArray): @@ -802,52 +869,3 @@ def value_counts(self, dropna: bool = True) -> Series: # ------------------------------------------------------------------------ # String methods interface _str_na_value = np.nan - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - convert = convert and not np.all(mask) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - na_value_is_na = isna(na_value) - if na_value_is_na: - if is_integer_dtype(dtype): - na_value = 0 - else: - na_value = True - - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - if na_value_is_na and mask.any(): - if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") - result[mask] = np.nan - return result - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 94f6f9064885e..607f6f7e4246a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -7,6 +7,7 @@ TYPE_CHECKING, Callable, Union, + cast, ) import warnings @@ -25,9 +26,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_integer_dtype, - is_object_dtype, is_scalar, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.missing import isna @@ -284,9 +283,53 @@ def _data(self): # base class "ObjectStringArrayMixin" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] + def _str_map_nan_semantics( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + + dtype = np.dtype(cast(type, dtype)) + if mask.any(): + # numpy int/bool dtypes cannot hold NaNs so we must convert to + # float64 for int (to match maybe_convert_objects) or + # object for bool (again to match maybe_convert_objects) + if is_integer_dtype(dtype): + dtype = np.dtype("float64") + else: + dtype = np.dtype(object) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=dtype, + ) + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics( + f, na_value=na_value, dtype=dtype, convert=convert + ) + # TODO: de-duplicate with StringArray method. This method is moreless copy and # paste. @@ -330,21 +373,8 @@ def _str_map( return constructor(result, mask) - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array( - result, mask=mask, type=pa.large_string(), from_pandas=True - ) - return type(self)(result) else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -615,61 +645,6 @@ def __getattribute__(self, item): return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan - else: - na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array( - result, mask=mask, type=pa.large_string(), from_pandas=True - ) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) - def _convert_int_dtype(self, result): if isinstance(result, pa.Array): result = result.to_numpy(zero_copy_only=False) From dd2680c776c134e9b33c37825473c8d1856714d9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:54:03 -0400 Subject: [PATCH 15/52] String dtype: use 'str' string alias and representation for NaN-variant of the dtype (#59388) --- pandas/_testing/__init__.py | 6 +++- pandas/core/arrays/arrow/array.py | 5 ++- pandas/core/arrays/string_.py | 24 +++++++++---- pandas/core/frame.py | 4 ++- pandas/core/interchange/utils.py | 7 +++- pandas/tests/apply/test_numba.py | 2 +- pandas/tests/apply/test_series_apply.py | 2 +- pandas/tests/arrays/boolean/test_astype.py | 12 +++++-- .../tests/arrays/categorical/test_astype.py | 2 +- pandas/tests/arrays/categorical/test_repr.py | 2 +- pandas/tests/arrays/floating/test_astype.py | 17 ++++++--- pandas/tests/arrays/integer/test_dtypes.py | 17 ++++++--- .../arrays/interval/test_interval_pyarrow.py | 3 -- .../tests/arrays/period/test_arrow_compat.py | 4 --- pandas/tests/arrays/string_/test_string.py | 35 +++++++++++-------- .../tests/arrays/string_/test_string_arrow.py | 6 ++-- pandas/tests/arrays/test_datetimelike.py | 7 ++-- pandas/tests/dtypes/test_common.py | 19 ++++++++++ pandas/tests/dtypes/test_dtypes.py | 2 +- pandas/tests/extension/test_string.py | 14 ++++++++ pandas/tests/frame/indexing/test_indexing.py | 8 ++--- pandas/tests/frame/indexing/test_set_value.py | 2 +- pandas/tests/frame/methods/test_astype.py | 6 ++-- .../frame/methods/test_get_numeric_data.py | 4 ++- pandas/tests/frame/methods/test_nlargest.py | 2 +- .../tests/frame/methods/test_reset_index.py | 4 +-- .../tests/frame/methods/test_select_dtypes.py | 19 ++++++++-- pandas/tests/frame/methods/test_to_csv.py | 5 +-- pandas/tests/frame/test_block_internals.py | 4 ++- pandas/tests/frame/test_constructors.py | 31 +++++++++++----- pandas/tests/frame/test_stack_unstack.py | 6 +++- pandas/tests/generic/test_to_xarray.py | 4 +-- pandas/tests/groupby/test_apply.py | 17 +++------ pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_groupby.py | 7 ++-- pandas/tests/groupby/test_numeric_only.py | 2 ++ .../tests/indexes/base_class/test_formats.py | 1 - .../tests/indexes/multi/test_constructors.py | 2 +- pandas/tests/indexes/multi/test_get_set.py | 6 ++-- pandas/tests/indexes/object/test_indexing.py | 2 ++ pandas/tests/indexes/test_base.py | 4 +-- pandas/tests/indexing/multiindex/test_loc.py | 2 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/indexing/test_loc.py | 15 +++++--- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/excel/test_writers.py | 2 +- .../tests/io/json/test_json_table_schema.py | 6 ++-- pandas/tests/io/json/test_pandas.py | 7 ++-- pandas/tests/io/pytables/test_keys.py | 7 +++- pandas/tests/io/pytables/test_subclass.py | 3 ++ pandas/tests/io/test_common.py | 1 + pandas/tests/io/test_fsspec.py | 2 +- pandas/tests/io/test_gcs.py | 1 + pandas/tests/io/xml/test_xml_dtypes.py | 4 --- .../tests/reshape/concat/test_categorical.py | 4 +-- pandas/tests/reshape/concat/test_empty.py | 4 +-- pandas/tests/reshape/concat/test_index.py | 6 ++-- pandas/tests/reshape/merge/test_join.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 10 +++--- pandas/tests/reshape/merge/test_merge_asof.py | 4 +-- pandas/tests/reshape/test_from_dummies.py | 2 +- pandas/tests/reshape/test_get_dummies.py | 2 +- .../series/accessors/test_dt_accessor.py | 2 +- pandas/tests/series/indexing/test_delitem.py | 11 +++--- pandas/tests/series/indexing/test_getitem.py | 4 +-- pandas/tests/series/indexing/test_setitem.py | 2 +- pandas/tests/series/methods/test_astype.py | 4 +-- pandas/tests/series/methods/test_map.py | 14 +++----- pandas/tests/series/methods/test_rename.py | 2 +- .../tests/series/methods/test_reset_index.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 7 ++-- pandas/tests/series/test_constructors.py | 11 ++++-- pandas/tests/series/test_formats.py | 4 +-- pandas/tests/test_downstream.py | 3 -- pandas/tests/util/test_assert_frame_equal.py | 4 +-- pandas/tests/util/test_assert_index_equal.py | 2 +- pandas/tests/util/test_assert_series_equal.py | 6 ++-- pandas/tests/window/test_api.py | 2 +- 79 files changed, 306 insertions(+), 192 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 994b351acf42c..10c1c490551fb 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -14,6 +14,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.localization import ( can_set_locale, get_locales, @@ -110,7 +111,10 @@ ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES] COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] -STRING_DTYPES: list[Dtype] = [str, "str", "U"] +if using_string_dtype(): + STRING_DTYPES: list[Dtype] = [str, "U"] +else: + STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a156042ac0c0e..6c44b7759f0e2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -570,7 +570,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if ( + isinstance(self._dtype, StringDtype) + and self._dtype.storage == "pyarrow" + ): # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3cbacec9d411d..0929791ded58c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - ClassVar, Literal, cast, ) @@ -114,9 +113,12 @@ class StringDtype(StorageExtensionDtype): string[pyarrow] """ - # error: Cannot override instance variable (previously declared on - # base class "StorageExtensionDtype") with class variable - name: ClassVar[str] = "string" # type: ignore[misc] + @property + def name(self) -> str: # type: ignore[override] + if self._na_value is libmissing.NA: + return "string" + else: + return "str" #: StringDtype().na_value uses pandas.NA except the implementation that # follows NumPy semantics, which uses nan. @@ -133,7 +135,7 @@ def __init__( ) -> None: # infer defaults if storage is None: - if using_string_dtype() and na_value is not libmissing.NA: + if na_value is not libmissing.NA: if HAS_PYARROW: storage = "pyarrow" else: @@ -166,11 +168,19 @@ def __init__( self.storage = storage self._na_value = na_value + def __repr__(self) -> str: + if self._na_value is libmissing.NA: + return f"{self.name}[{self.storage}]" + else: + # TODO add more informative repr + return self.name + def __eq__(self, other: object) -> bool: # we need to override the base class __eq__ because na_value (NA or NaN) # cannot be checked with normal `==` if isinstance(other, str): - if other == self.name: + # TODO should dtype == "string" work for the NaN variant? + if other == "string" or other == self.name: # noqa: PLR1714 return True try: other = self.construct_from_string(other) @@ -227,6 +237,8 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() + elif string == "str" and using_string_dtype(): + return cls(na_value=np.nan) elif string == "string[python]": return cls(storage="python") elif string == "string[pyarrow]": diff --git a/pandas/core/frame.py b/pandas/core/frame.py index afcd4d014316e..1403fc2ceaaf8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4979,7 +4979,9 @@ def select_dtypes(self, include=None, exclude=None) -> Self: ----- * To select all *numeric* types, use ``np.number`` or ``'number'`` * To select strings you must use the ``object`` dtype, but note that - this will return *all* object dtype columns + this will return *all* object dtype columns. With + ``pd.options.future.infer_string`` enabled, using ``"str"`` will + work to select all string columns. * See the `numpy dtype hierarchy `__ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index fd1c7c9639242..035a1f8abdbc5 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -135,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if format_str is not None: return format_str - if lib.is_np_dtype(dtype, "M"): + if isinstance(dtype, pd.StringDtype): + # TODO(infer_string) this should be LARGE_STRING for pyarrow storage, + # but current tests don't cover this distinction + return ArrowCTypes.STRING + + elif lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: # dtype.str -> ' 'n' resolution = np.datetime_data(dtype)[0][0] diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index aee9100702350..6ac0b49f0e4e7 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -110,7 +110,7 @@ def test_numba_unsupported_dtypes(apply_axis): with pytest.raises( ValueError, - match="Column b must have a numeric dtype. Found 'object|string' instead", + match="Column b must have a numeric dtype. Found 'object|str' instead", ): df.apply(f, engine="numba", axis=apply_axis) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index df24fa08f48e1..69f84ca74ab0b 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -244,7 +244,7 @@ def test_apply_categorical(by_row, using_infer_string): result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py index 932e903c0e448..8c2672218f273 100644 --- a/pandas/tests/arrays/boolean/test_astype.py +++ b/pandas/tests/arrays/boolean/test_astype.py @@ -5,7 +5,7 @@ import pandas._testing as tm -def test_astype(): +def test_astype(using_infer_string): # with missing values arr = pd.array([True, False, None], dtype="boolean") @@ -20,8 +20,14 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array( + ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_extension_array_equal(result, expected) + else: + expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) # no missing values arr = pd.array([True, False, True], dtype="boolean") diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index a2a53af6ab1ad..ee930ac84aaf2 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -89,7 +89,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index e2e5d47f50209..3a2c489920eb0 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -22,7 +22,7 @@ def test_print(self, using_infer_string): if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, string): [a < b < c]", + "Categories (3, str): [a < b < c]", ] else: expected = [ diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da..ccf644b34051d 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -63,12 +63,21 @@ def test_astype_to_integer_array(): tm.assert_extension_array_equal(result, expected) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([0.1, 0.2, None], dtype="Float64") - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["0.1", "0.2", ""], dtype="U32") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["0.1", "0.2", ""], dtype="U32") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_copy(): diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 8620763988e06..7be00e569b3fe 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -278,12 +278,21 @@ def test_to_numpy_na_raises(dtype): a.to_numpy(dtype=dtype) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_boolean(): diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index be87d5d3ef7ba..ef8701be81e2b 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -82,7 +80,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index ff86b696c8403..431309aca0df2 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,7 +1,5 @@ import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -79,7 +77,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -99,7 +96,6 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b51b01c2b5168..1296cc3b5a494 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -65,7 +65,7 @@ def test_repr(dtype): assert repr(df) == expected if dtype.na_value is np.nan: - expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected @@ -75,10 +75,10 @@ def test_repr(dtype): expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" elif dtype.storage == "python" and dtype.na_value is np.nan: arr_name = "StringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -502,7 +502,7 @@ def test_fillna_args(dtype): tm.assert_extension_array_equal(res, expected) if dtype.storage == "pyarrow": - msg = "Invalid value '1' for dtype string" + msg = "Invalid value '1' for dtype str" else: msg = "Cannot set non-string value '1' into a StringArray." with pytest.raises(TypeError, match=msg): @@ -524,7 +524,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -539,14 +539,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) - # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is result["a"].dtype.na_value + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + tm.assert_frame_equal(result, expected) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -563,9 +566,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) + + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d38b728aaf120..8d5c16e448cee 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,8 +28,9 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python": - # python string storage with na_value=NaN is not yet implemented + if using_infer_string and string_storage == "python" and HAS_PYARROW: + # string storage with na_value=NaN always uses pyarrow if available + # -> does not yet honor the option request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 4961123a7ca07..360ab960088ed 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -295,7 +295,9 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, string_storage): + def test_searchsorted_castable_strings( + self, arr1d, box, string_storage, using_infer_string + ): arr = arr1d if box is None: pass @@ -331,7 +333,8 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got string array instead." + "or array of those. Got " + f"{'str' if using_infer_string else 'string'} array instead." ), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..e0232bb292d6e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -799,3 +799,22 @@ def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 with tm.assert_produces_warning(UserWarning): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() + + +def test_pandas_dtype_string_dtypes(string_storage): + # TODO(infer_string) remove skip if "python" is supported + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + # TODO(infer_string) hardcoded to pyarrow until python is supported + assert result == pd.StringDtype("pyarrow", na_value=np.nan) + + with pd.option_context("future.infer_string", False): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == np.dtype("U") + + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("string") + assert result == pd.StringDtype(string_storage, na_value=pd.NA) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index e522d2666a2dc..a4916ed1bbd8a 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1062,7 +1062,7 @@ def test_str_vs_repr(self, ordered, using_infer_string): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " rf"categories_dtype={dtype}\)" diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 1102d9d941663..f800f734ec9d9 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -116,6 +116,20 @@ def test_is_not_string_type(self, dtype): # because StringDtype is a string type assert is_string_dtype(dtype) + def test_is_dtype_from_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + result = type(dtype).is_dtype(dtype.name) + assert result is False + else: + super().test_is_dtype_from_name(dtype) + + def test_construct_from_string_own_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"): + dtype.construct_from_string(dtype.name) + else: + super().test_construct_from_string_own_name(dtype) + def test_view(self, data): if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 09f359df37dd1..ec00044b84c49 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -337,7 +337,7 @@ def test_setitem( smaller["col10"] = ["1", "2"] if using_infer_string: - assert smaller["col10"].dtype == "string" + assert smaller["col10"].dtype == "str" else: assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -472,13 +472,13 @@ def test_setitem_corner(self, float_frame, using_infer_string): del dm["foo"] dm["foo"] = "bar" if using_infer_string: - assert dm["foo"].dtype == "string" + assert dm["foo"].dtype == "str" else: assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] if using_infer_string: - assert dm["coercible"].dtype == "string" + assert dm["coercible"].dtype == "str" else: assert dm["coercible"].dtype == np.object_ @@ -514,7 +514,7 @@ def test_setitem_ambig(self, using_infer_string): dm[2] = uncoercable_series assert len(dm.columns) == 3 if using_infer_string: - assert dm[2].dtype == "string" + assert dm[2].dtype == "str" else: assert dm[2].dtype == np.object_ diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index ce771280bc264..3d23e13264911 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -28,7 +28,7 @@ def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame.copy() res._set_value("foobar", "baz", "sam") if using_infer_string: - assert res["baz"].dtype == "string" + assert res["baz"].dtype == "str" else: assert res["baz"].dtype == np.object_ res = float_frame.copy() diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index ea9cc22d93758..9c27e76de91b2 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -202,7 +202,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"], dtype="object"), + "b": Series(["0", "1", "2", "3", "4"], dtype="str"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -263,9 +263,9 @@ def test_astype_duplicate_col(self): a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) - result = df.astype(str) + result = df.astype("str") a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") - b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype="str", name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index c5d32d56d03c1..6d097e75f6703 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -33,7 +33,9 @@ def test_get_numeric_data(self, using_infer_string): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname) if not using_infer_string else "string", + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 3ba893501914a..54f2e45488b78 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype (object|string), " + f"Column 'b' has dtype (object|str), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 44d7bbf57fe0a..8d93c97b6b68a 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -664,7 +664,7 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex( idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes if using_infer_string and dtype == object: - dtype = "string" + dtype = pd.StringDtype(na_value=np.nan) expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -697,7 +697,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") if using_infer_string: - expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") + expected["c2"] = expected["c2"].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index d1bee6a3de613..875dca321635f 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -50,7 +50,7 @@ def copy(self): class TestSelectDtypes: - def test_select_dtypes_include_using_list_like(self): + def test_select_dtypes_include_using_list_like(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -94,6 +94,11 @@ def test_select_dtypes_include_using_list_like(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include=["period"]) + if using_infer_string: + ri = df.select_dtypes(include=["str"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -151,7 +156,7 @@ def test_select_dtypes_exclude_include_int(self, include): expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) - def test_select_dtypes_include_using_scalars(self): + def test_select_dtypes_include_using_scalars(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -187,6 +192,11 @@ def test_select_dtypes_include_using_scalars(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include="period") + if using_infer_string: + ri = df.select_dtypes(include="str") + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_scalars(self): df = DataFrame( { @@ -347,7 +357,10 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) - def test_select_dtypes_str_raises(self, dtype, arg): + def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): + if using_infer_string and dtype == "str": + # this is tested below + pytest.skip("Selecting string columns works with future strings") df = DataFrame( { "a": list("abc"), diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index bed8b030bc72a..20a8e95f990ec 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -697,10 +697,7 @@ def test_to_csv_interval_index(self, using_infer_string): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected.index = expected.index.astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 9bd61736624ca..0766e927a64a9 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -209,7 +209,9 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index aab900f6eef47..c9eb2d5ca7be4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -265,7 +265,7 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns tm.assert_frame_equal(result, expected) def test_constructor_mixed(self, float_string_frame, using_infer_string): - dtype = "string" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): @@ -789,7 +789,7 @@ def test_constructor_dict_cast(self, using_infer_string): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ if not using_infer_string else "string" + assert frame["B"].dtype == np.object_ if not using_infer_string else "str" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1209,7 +1209,7 @@ def test_constructor_scalar_inference(self, using_infer_string): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ if not using_infer_string else "string" + assert df["object"].dtype == np.object_ if not using_infer_string else "str" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1292,7 +1292,7 @@ def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ if not using_infer_string else "string" + assert df["str"].dtype == np.object_ if not using_infer_string else "str" # GH 4851 # list of 0-dim ndarrays @@ -1860,7 +1860,12 @@ def test_constructor_with_datetimes(self, using_infer_string): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + + [ + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1882,7 +1887,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1904,7 +1913,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -2124,7 +2137,9 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[ns]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 75ef348b75deb..2c3e9c1d5e327 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -657,7 +657,11 @@ def test_unstack_dtypes(self, using_infer_string): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes - dtype = "string" if using_infer_string else np.dtype("object") + dtype = ( + pd.StringDtype(na_value=np.nan) + if using_infer_string + else np.dtype("object") + ) expected = Series( [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index d8401a8b2ae3f..9fe9bca8abdc9 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -52,7 +52,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): # column names are lost expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) @@ -81,7 +81,7 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4972a6b3afa17..d91510d834e6c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -79,7 +77,7 @@ def test_apply_index_date(using_infer_string): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(using_infer_string): +def test_apply_index_date_object(): # GH 5789 # don't auto coerce dates ts = [ @@ -111,10 +109,7 @@ def test_apply_index_date_object(using_infer_string): 1.40750, 1.40649, ] - dtype = "string[pyarrow_numpy]" if using_infer_string else object - exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" - ) + exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): @@ -942,12 +937,11 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike, using_infer_string): +def test_apply_datetime_issue(group_column_dtlike): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -958,8 +952,7 @@ def test_apply_datetime_issue(group_column_dtlike, using_infer_string): with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - dtype = "string" if using_infer_string else "object" - expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) + expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1040,7 +1033,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes - dtype = "string" if using_infer_string else object + dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1073dda954563..c70995de7b3b2 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -131,7 +131,7 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - dtype = "string[pyarrow_numpy]" if using_infer_string else object + dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4381b36b0b73a..dc1658e9acf3b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1216,7 +1216,7 @@ def test_groupby_complex_mean(): tm.assert_frame_equal(result, expected) -def test_groupby_complex_numbers(using_infer_string): +def test_groupby_complex_numbers(): # GH 17927 df = DataFrame( [ @@ -1225,11 +1225,10 @@ def test_groupby_complex_numbers(using_infer_string): {"a": 4, "b": 1}, ] ) - dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype=dtype), + columns=Index(["a"]), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -2097,7 +2096,7 @@ def get_categorical_invalid_expected(): idx = Index(lev, name=keys[0]) if using_infer_string: - columns = Index([], dtype="string[pyarrow_numpy]") + columns = Index([], dtype="str") else: columns = [] expected = DataFrame([], columns=columns, index=idx) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index ff4685b1e412d..029d322e4fdc3 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -181,6 +181,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with pytest.raises(exception, match=msg): @@ -198,6 +199,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index b2f345e5e6f77..955e3be107f75 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -9,7 +9,6 @@ class TestIndexRendering: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 8456e6a7acba5..b1180f2d7af14 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -851,7 +851,7 @@ def test_dtype_representation(using_infer_string): # GH#46900 pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) result = pmidx.dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = Series( ["int64", exp], index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]), diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 6eeaeb6711d03..17ca876487330 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -41,7 +41,7 @@ def test_get_dtypes(using_infer_string): names=["int", "string", "dt"], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "int": np.dtype("int64"), @@ -61,7 +61,7 @@ def test_get_dtypes_no_level_name(using_infer_string): pd.date_range("20200101", periods=2, tz="UTC"), ], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "level_0": np.dtype("int64"), @@ -82,7 +82,7 @@ def test_get_dtypes_duplicate_level_names(using_infer_string): ], names=["A", "A", "A"], ).dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], index=["A", "A", "A"], diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ebf9dac715f8d..493a5be735d1a 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -170,6 +170,7 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + # TODO(infer_string) parametrize over multiple string dtypes @pytest.mark.parametrize( "dtype", [ @@ -208,6 +209,7 @@ def test_slice_locs_negative_step(self, in_slice, expected, dtype): expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) + # TODO(infer_string) parametrize over multiple string dtypes @td.skip_if_no("pyarrow") def test_slice_locs_negative_step_oob(self): index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7eeb626d91dc8..4d02ec853e0da 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -79,7 +79,7 @@ def test_constructor_copy(self, using_infer_string): assert new_index.name == "name" if using_infer_string: tm.assert_extension_array_equal( - new_index.values, pd.array(arr, dtype="string[pyarrow_numpy]") + new_index.values, pd.array(arr, dtype="str") ) else: tm.assert_numpy_array_equal(arr, new_index.values) @@ -160,7 +160,7 @@ def test_constructor_from_frame_series_freq(self, using_infer_string): df = DataFrame(np.random.default_rng(2).random((5, 3))) df["date"] = dts result = DatetimeIndex(df["date"], freq="MS") - dtype = object if not using_infer_string else "string" + dtype = object if not using_infer_string else "str" assert df["date"].dtype == dtype expected.name = "date" tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 5508153322adb..fa5ec63dd32fe 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -588,7 +588,7 @@ def test_loc_nan_multiindex(using_infer_string): np.ones((1, 4)), index=Index( [np.nan], - dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + dtype="object" if not using_infer_string else "str", name="u3", ), columns=Index(["d1", "d2", "d3", "d4"]), diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d2c8454019a5e..908e95accfb0f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -294,7 +294,7 @@ def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): with pytest.raises( KeyError, match=re.escape( - "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + "\"None of [Index(['E'], dtype='str')] are in the [index]\"" ), ): dfnu.loc[["E"]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 41431c0e2813b..34d827a209dae 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -63,12 +63,17 @@ def test_not_change_nan_loc(series, new_series, expected_ser): class TestLoc: - def test_none_values_on_string_columns(self): + def test_none_values_on_string_columns(self, using_infer_string): # Issue #32218 - df = DataFrame(["1", "2", None], columns=["a"], dtype="str") - + df = DataFrame(["1", "2", None], columns=["a"], dtype=object) assert df.loc[2, "a"] is None + df = DataFrame(["1", "2", None], columns=["a"], dtype="str") + if using_infer_string: + assert np.isnan(df.loc[2, "a"]) + else: + assert df.loc[2, "a"] is None + @pytest.mark.parametrize("kind", ["series", "frame"]) def test_loc_getitem_int(self, kind, request): # int label @@ -1460,7 +1465,7 @@ def test_loc_setitem_single_row_categorical(self, using_infer_string): result = df["Alpha"] expected = Series(categories, index=df.index, name="Alpha").astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) tm.assert_series_equal(result, expected) @@ -1635,7 +1640,7 @@ def test_loc_setitem_single_column_mixed(self, using_infer_string): df.loc[df.index[::2], "str"] = np.nan expected = Series( [np.nan, "qux", np.nan, "qux", np.nan], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ).values tm.assert_almost_equal(df["str"].values, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index ca551024b4c1f..5fcb71d0186a6 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -227,7 +227,7 @@ def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): { "x": Series( ["1", "2"], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ), "y": Series([np.nan, np.nan], dtype=object), } diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ce88bae6e02f2..30c5d3177c5a5 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -626,7 +626,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(copy=True) - dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 7ecddb18a61ec..57091b268a9db 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -766,7 +766,7 @@ def test_to_excel_interval_no_labels(self, path, using_infer_string): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) df.to_excel(path, sheet_name="test1") diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 53e819ac5eaff..1e47b3bc38737 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -75,7 +75,7 @@ def test_build_table_schema(self, df_schema, using_infer_string): "primaryKey": ["idx"], } if using_infer_string: - expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result @@ -128,7 +128,7 @@ def test_multiindex(self, df_schema, using_infer_string): "type": "any", "extDtype": "string", } - expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected df.index.names = ["idx0", None] @@ -311,7 +311,7 @@ def test_to_json(self, df_table, using_infer_string): ] if using_infer_string: - fields[2] = {"name": "B", "type": "any", "extDtype": "string"} + fields[2] = {"name": "B", "type": "any", "extDtype": "str"} schema = {"fields": fields, "primaryKey": ["idx"]} data = [ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a1d2e93e7c523..cb94111aedffd 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -266,7 +266,7 @@ def test_roundtrip_categorical( expected = categorical_frame.copy() expected.index = expected.index.astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @@ -621,7 +621,7 @@ def test_blocks_compat_GH9037(self, using_infer_string): # JSON deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype( - np.str_ if not using_infer_string else "string[pyarrow_numpy]" + np.str_ if not using_infer_string else "str" ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") @@ -706,7 +706,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string expected = string_series if using_infer_string and orient in ("split", "index", "columns"): # These schemas don't contain dtypes, so we infer string - expected.index = expected.index.astype("string[pyarrow_numpy]") + expected.index = expected.index.astype("str") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -1492,7 +1492,6 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 55bd3f0d5a03a..7d0802dcf2e47 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, HDFStore, @@ -13,7 +15,10 @@ tables, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index 03622faa2b5a8..bbe1cd77e0d9f 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -17,6 +19,7 @@ class TestHDFStoreSubclass: # GH 33748 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 56707560c2fda..75ecd1d929d58 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -377,6 +377,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 65f4156cedf49..19b60e17d3a92 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -168,6 +168,7 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -277,7 +278,6 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a7ae9c7049702..96bc0326b23ab 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -197,6 +197,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index b2d96cb1d9133..a85576ff13f5c 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -4,8 +4,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas.util._test_decorators as td @@ -85,7 +83,6 @@ def read_xml_iterparse(data, **kwargs): # DTYPE -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_single_str(parser): df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( @@ -211,7 +208,6 @@ def test_wrong_dtype(xml_books, parser, iterparse): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_both_dtype_converters(parser): df_expected = DataFrame( { diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index bbaaf0abecfbd..8e6a14e6bfb8f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -59,9 +59,7 @@ def test_categorical_concat_dtypes(self, using_infer_string): num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == ( - object if not using_infer_string else "string[pyarrow_numpy]" - ) + result = df.dtypes == (object if not using_infer_string else "str") expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 30ef0a934157b..9560087615123 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -27,7 +27,7 @@ def test_handle_empty_objects(self, sort, using_infer_string): expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) expected["foo"] = expected["foo"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.loc[0:4, "foo"] = "bar" @@ -284,7 +284,7 @@ def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ if not using_infer_string else "string" + assert result["b"].dtype == np.object_ if not using_infer_string else "str" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 52bb9fa0f151b..49c94168d203e 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -452,9 +452,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series( - [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" - ) + s4 = Series([], dtype=object if not using_infer_string else "str") result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -465,7 +463,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ) tm.assert_frame_equal( result, expected, check_index_type=True, check_column_type=True diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index db5a0437a14f0..91f0cf6c31085 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -156,7 +156,7 @@ def test_join_on(self, target_source, infer_string): # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object|string columns for key " + "You are trying to merge on float64 and object|str columns for key " "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ed49f3b758cc5..8a9fe9f3e2cfd 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -826,7 +826,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|str'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1877,7 +1877,7 @@ def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], @@ -1889,7 +1889,7 @@ def test_basic(self, left, right, using_infer_string): # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -2003,7 +2003,7 @@ def test_other_columns(self, left, right, using_infer_string): merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -2040,7 +2040,7 @@ def test_dtype_on_merged_different( merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 0865e3cfa8149..11e29f4e10dc4 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3183,7 +3183,7 @@ def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3234,7 +3234,7 @@ def test_by_mixed_tz_aware(self, using_infer_string): ) expected["value_y"] = np.array([np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index b12438b6327ad..6009b263a83c5 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -336,7 +336,7 @@ def test_no_prefix_string_cats_default_category( dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) if using_infer_string: - expected[""] = expected[""].astype("string[pyarrow_numpy]") + expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 3d9b3a6d1c7a2..2c17b7f6a5a47 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -122,7 +122,7 @@ def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string): result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - key = "string" if using_infer_string else "object" + key = "str" if using_infer_string else "object" expected_counts = {"int64": 1, key: 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index f0803ac2f2a30..0dd2c227d6aa7 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -599,7 +599,7 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_index_equal(result, expected) @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index 3d1082c3d040b..7440ef2692c47 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,16 +31,15 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self, using_infer_string): + def test_delitem_object_index(self): # Index(dtype=object) - dtype = "string[pyarrow_numpy]" if using_infer_string else object - s = Series(1, index=Index(["a"], dtype=dtype)) + s = Series(1, index=Index(["a"], dtype="str")) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype="str"))) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 596a225c288b8..9783dcd2fea07 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -363,9 +363,7 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) key = box(key) - msg = ( - r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" - ) + msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" with pytest.raises(KeyError, match=msg): ser[key] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index fb8e5c31929b2..e2c27fe5575db 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -624,7 +624,7 @@ def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string ser = Series(["a", "b"]) ser[3] = nulls_fixture dtype = ( - "string[pyarrow_numpy]" + "str" if using_infer_string and not isinstance(nulls_fixture, Decimal) else object ) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4c8028e74ee55..ef0757ffe4aa8 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -538,12 +538,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype="str") tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 251d4063008b9..ac489b2579e05 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -101,16 +101,16 @@ def test_map_series_stringdtype(any_string_dtype, using_infer_string): expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) if using_infer_string and any_string_dtype == "object": - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], "str")], ) -def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): +def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -118,8 +118,6 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - if using_infer_string and expected_dtype == object: - expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -145,9 +143,7 @@ def test_map_simple_str_callables_same_as_astype( # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype( - str if not using_infer_string else "string[pyarrow_numpy]" - ) + expected = string_series.astype(str if not using_infer_string else "str") tm.assert_series_equal(result, expected) @@ -497,7 +493,7 @@ def test_map_categorical(na_action, using_infer_string): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 119654bd19b3f..a8f3862d39f07 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -64,7 +64,7 @@ def test_rename_set_name_inplace(self, using_infer_string): assert ser.name == name exp = np.array(["a", "b", "c"], dtype=np.object_) if using_infer_string: - exp = array(exp, dtype="string[pyarrow_numpy]") + exp = array(exp, dtype="str") tm.assert_extension_array_equal(ser.index.values, exp) else: tm.assert_numpy_array_equal(ser.index.values, exp) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 48e2608a1032a..fa571fa126b38 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -193,7 +193,7 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex( # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes - exp = "string" if using_infer_string else object + exp = "str" if using_infer_string else object expected = Series( { "level_0": np.int64, diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 999dd90d337d9..efb249fdedf3d 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -177,9 +177,6 @@ def test_to_csv_interval_index(self, using_infer_string): result = self.read_csv(path, index_col=0) # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) - expected = s.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected = s + expected.index = expected.index.astype("str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0aaa8ddcfda0c..6efe0bcb8b45d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -167,7 +167,7 @@ def test_constructor(self, datetime_series, using_infer_string): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ if not using_infer_string else "string" + assert mixed.dtype == np.object_ if not using_infer_string else "str" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -1469,7 +1469,7 @@ def test_fromDict(self, using_infer_string): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ if not using_infer_string else "string" + assert series.dtype == np.object_ if not using_infer_string else "str" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) @@ -1481,7 +1481,7 @@ def test_fromValue(self, datetime_series, using_infer_string): assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ if not using_infer_string else "string" + assert strings.dtype == np.object_ if not using_infer_string else "str" assert len(strings) == len(datetime_series) d = datetime.now() @@ -2141,6 +2141,11 @@ def test_series_string_inference_storage_definition(self): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) + expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="str") + tm.assert_series_equal(result, expected) + def test_series_constructor_infer_string_scalar(self): # GH#55537 with pd.option_context("future.infer_string", True): diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 4939f3221d268..77e77a9337d63 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -323,7 +323,7 @@ def test_categorical_repr(self, using_infer_string): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, string): [a, b]" + "Length: 50, dtype: category\nCategories (2, str): [a, b]" ) else: exp = ( @@ -341,7 +341,7 @@ def test_categorical_repr(self, using_infer_string): exp = ( "0 a\n1 b\n" "dtype: category\n" - "Categories (26, string): [a < b < c < d ... w < x < y < z]" + "Categories (26, str): [a < b < c < d ... w < x < y < z]" ) else: exp = ( diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index bf88da04b73ff..51ce73ef54300 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -167,7 +165,6 @@ def test_pandas_datareader(): pytest.importorskip("pandas_datareader") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 79132591b15b3..dd5218ab9404f 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -111,7 +111,7 @@ def test_empty_dtypes(check_dtype): @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.index are different @@ -131,7 +131,7 @@ def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string) @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.columns are different diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index dc6efdcec380e..ab52d6c8e9f39 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -207,7 +207,7 @@ def test_index_equal_names(name1, name2): def test_index_equal_category_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Index are different diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 1878e7d838064..0d56885a1cb84 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -221,9 +221,9 @@ def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, string\\): \\[a, b, c\\] +Categories \\(3, str\\): \\[a, b, c\\] \\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, string\\): \\[a, b, c\\]""" +Categories \\(3, str\\): \\[a, b, c\\]""" else: msg = """Series are different @@ -258,7 +258,7 @@ def test_series_equal_datetime_values_mismatch(rtol): def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Attributes of Series are different diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index fe2da210c6fe9..948565be36b5b 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -71,7 +71,7 @@ def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) with pytest.raises( - DataError, match="Cannot aggregate non-numeric type: object|string" + DataError, match="Cannot aggregate non-numeric type: object|str" ): # GH#42738, enforced in 2.0 r.sum() From a9fd6f112a8d661b3217666d92ec1a312b3a0336 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Aug 2024 17:25:05 +0200 Subject: [PATCH 16/52] String dtype: fix alignment sorting in case of python storage (#59448) * String dtype: fix alignment sorting in case of python storage * add test --- pandas/core/indexes/base.py | 5 ++++- pandas/tests/series/methods/test_align.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 11d2436b0e095..825316585c03c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5072,7 +5072,10 @@ def _can_use_libjoin(self) -> bool: return ( isinstance(self.dtype, np.dtype) or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) - or self.dtype == "string[python]" + or ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ) ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index cb60cd2e5bcf3..f332aad0c05f9 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -211,6 +211,19 @@ def test_align_periodindex(join_type): ts.align(ts[::2], join=join_type) +def test_align_stringindex(any_string_dtype): + left = Series(range(3), index=pd.Index(["a", "b", "d"], dtype=any_string_dtype)) + right = Series(range(3), index=pd.Index(["a", "b", "c"], dtype=any_string_dtype)) + result_left, result_right = left.align(right) + + expected_idx = pd.Index(["a", "b", "c", "d"], dtype=any_string_dtype) + expected_left = Series([0, 1, np.nan, 2], index=expected_idx) + expected_right = Series([0, 1, 2, np.nan], index=expected_idx) + + tm.assert_series_equal(result_left, expected_left) + tm.assert_series_equal(result_right, expected_right) + + def test_align_left_fewer_levels(): # GH#45224 left = Series([2], index=pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"])) From bf7fb018694fb0dc4bf1e65a4aeb6f2320bfbab4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 14 Aug 2024 12:17:31 -0400 Subject: [PATCH 17/52] TST (string dtype): add test build with future strings enabled without pyarrow (#59437) * TST (string dtype): add test build with future strings enabled without pyarrow * ensure the build doesn't override the default ones * uninstall -> remove * avoid jobs with same env being cancelled * use different python version for both future jobs * add some xfails * fixup xfails * less strict --- .github/actions/setup-conda/action.yml | 6 +++++ .github/workflows/unit-tests.yml | 7 +++++- pandas/tests/apply/test_frame_apply.py | 5 ++++ pandas/tests/apply/test_invalid_arg.py | 11 +++++++++ pandas/tests/apply/test_numba.py | 1 + pandas/tests/arithmetic/test_object.py | 6 +++++ .../tests/arrays/boolean/test_arithmetic.py | 7 ++++++ .../arrays/categorical/test_analytics.py | 9 +++++-- .../arrays/categorical/test_constructors.py | 6 ++++- pandas/tests/arrays/integer/test_reduction.py | 9 ++++--- pandas/tests/base/test_conversion.py | 12 +++++++++- pandas/tests/copy_view/test_astype.py | 5 ++-- pandas/tests/copy_view/test_functions.py | 12 ++++++---- pandas/tests/copy_view/test_interp_fillna.py | 4 +++- pandas/tests/copy_view/test_methods.py | 8 +++++-- pandas/tests/copy_view/test_replace.py | 6 +++-- pandas/tests/extension/base/ops.py | 24 +++++++++++++++++++ pandas/tests/extension/test_categorical.py | 2 ++ pandas/tests/extension/test_numpy.py | 7 ++++++ pandas/tests/frame/indexing/test_indexing.py | 5 +++- pandas/tests/frame/indexing/test_where.py | 8 +++++++ pandas/tests/frame/methods/test_info.py | 3 ++- pandas/tests/frame/methods/test_rank.py | 14 ++++++++++- .../tests/frame/methods/test_value_counts.py | 7 ++++++ pandas/tests/frame/test_api.py | 6 ++++- pandas/tests/frame/test_arithmetic.py | 5 +++- pandas/tests/frame/test_constructors.py | 3 ++- pandas/tests/frame/test_logical_ops.py | 7 ++++++ pandas/tests/frame/test_reductions.py | 11 ++++++--- pandas/tests/frame/test_unary.py | 10 +++++++- .../groupby/methods/test_value_counts.py | 4 ++++ pandas/tests/groupby/test_groupby.py | 5 ++++ pandas/tests/groupby/test_groupby_dropna.py | 19 +++++++++++++++ pandas/tests/indexes/object/test_indexing.py | 6 +++++ pandas/tests/indexes/test_base.py | 20 +++++++++++++++- pandas/tests/indexes/test_old_base.py | 6 +++++ pandas/tests/indexing/test_loc.py | 4 ++++ pandas/tests/io/formats/style/test_bar.py | 1 + pandas/tests/io/parser/conftest.py | 22 +++++++++++++++-- pandas/tests/reductions/test_reductions.py | 5 ++++ pandas/tests/series/indexing/test_setitem.py | 6 +++++ pandas/tests/series/test_api.py | 7 ++++++ pandas/tests/series/test_logical_ops.py | 5 ++++ pandas/tests/series/test_reductions.py | 13 ++++++++++ pandas/tests/window/test_rolling.py | 6 +++++ 45 files changed, 322 insertions(+), 33 deletions(-) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index e31fed120267a..4fe901998cbcc 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -16,3 +16,9 @@ runs: condarc-file: ci/.condarc cache-environment: true cache-downloads: true + + - name: Uninstall pyarrow + if: ${{ env.REMOVE_PYARROW == '1' }} + run: | + micromamba remove -y pyarrow + shell: bash -el {0} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 3ba61e39316af..eef899173403b 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -29,6 +29,7 @@ jobs: env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] + pandas_future_infer_string: ["0"] include: - name: "Downstream Compat" env_file: actions-311-downstream_compat.yaml @@ -86,6 +87,9 @@ jobs: pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" - name: "Future infer strings" + env_file: actions-312.yaml + pandas_future_infer_string: "1" + - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" - name: "Pypy" @@ -114,9 +118,10 @@ jobs: NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} # Clipboard tests QT_QPA_PLATFORM: offscreen + REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}-${{ matrix.pandas_future_infer_string }} cancel-in-progress: true services: diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index a774ae214e09a..5e0f991d5c406 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -6,6 +6,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -1201,6 +1203,9 @@ def test_agg_multiple_mixed(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_agg_multiple_mixed_raises(): # GH 20909 mdf = DataFrame( diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index b5ad1094f5bf5..1c5b170c8753f 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -12,6 +12,9 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW from pandas.errors import SpecificationError from pandas import ( @@ -209,6 +212,10 @@ def transform(row): data.apply(transform, axis=1) +# we should raise a proper TypeError instead of propagating the pyarrow error +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) @pytest.mark.parametrize( "df, func, expected", tm.get_cython_table_params( @@ -229,6 +236,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_str df.agg(func, axis=axis) +# we should raise a proper TypeError instead of propagating the pyarrow error +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) @pytest.mark.parametrize( "series, func, expected", chain( diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 6ac0b49f0e4e7..6bbe5100e8826 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -104,6 +104,7 @@ def test_numba_nonunique_unsupported(apply_axis): def test_numba_unsupported_dtypes(apply_axis): + pytest.importorskip("pyarrow") f = lambda x: x df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) df["c"] = df["c"].astype("double[pyarrow]") diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4b5156d0007bb..899ea1910d055 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,6 +8,9 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -315,6 +318,9 @@ def test_add(self): expected = pd.Index(["1a", "1b", "1c"]) tm.assert_index_equal("1" + index, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_sub_fail(self, using_infer_string): index = pd.Index([str(i) for i in range(10)]) diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 0c4fcf149eb20..4dbd8eb9f5ca7 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -3,6 +3,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd import pandas._testing as tm @@ -90,6 +94,9 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): # invalid ops diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 7c7a236ab83cd..a38814ca43773 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,7 +6,10 @@ from pandas._config import using_string_dtype -from pandas.compat import PYPY +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( Categorical, @@ -298,7 +301,9 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_memory_usage(self): cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 6813683cb5219..8ac479cf8a0a4 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -8,6 +8,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -449,7 +451,9 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings" + ) def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index db04862e4ea07..e485c7f79b475 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -102,9 +104,10 @@ def test_groupby_reductions(op, expected): ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(op, expected, using_infer_string): - if op in ["any", "all"] and using_infer_string: - expected = expected.astype("bool") +def test_mixed_reductions(request, op, expected, using_infer_string): + if op in ["any", "all"] and using_infer_string and HAS_PYARROW: + # TODO(infer_string) inconsistent result type + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) df = DataFrame( { "A": ["a", "b", "b"], diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index fe0f1f1454a55..d62599c56e467 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -20,6 +24,7 @@ SparseArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics @@ -218,7 +223,9 @@ def test_iter_box_period(self): ) def test_values_consistent(arr, expected_type, dtype, using_infer_string): if using_infer_string and dtype == "object": - expected_type = ArrowStringArrayNumpySemantics + expected_type = ( + ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics + ) l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type @@ -355,6 +362,9 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 514ee6410ecf1..fb82329d5b50d 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -223,7 +224,7 @@ def test_astype_arrow_timestamp(using_copy_on_write): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(using_copy_on_write): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -243,7 +244,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes(using_copy_on_write): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 4ec1d023c8ba7..a87baaedb9244 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, Index, @@ -14,7 +16,7 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -39,7 +41,7 @@ def test_concat_frames(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames_updating_input(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -201,7 +203,7 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -319,7 +321,7 @@ def test_merge_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") @@ -353,7 +355,7 @@ def test_join_on_key(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 338b76cbf1e7a..1f7f8689d0779 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( NA, ArrowDtype, @@ -159,7 +161,7 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index d870342ef9e29..295d93580f451 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -3,6 +3,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import SettingWithCopyWarning import pandas as pd @@ -952,7 +953,7 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_infer_objects(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) df_orig = df.copy() @@ -974,6 +975,9 @@ def test_infer_objects(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_infer_objects_no_reference(using_copy_on_write): df = DataFrame( { @@ -1180,7 +1184,7 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(using_copy_on_write, warn_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 32da870c6d2e3..bc3edb1f72214 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( Categorical, DataFrame, @@ -66,7 +68,7 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_replace_regex_inplace(using_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") @@ -354,7 +356,7 @@ def test_replace_empty_list(using_copy_on_write): assert not df2._mgr._has_no_reference(0) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index fad2560265d21..ff9f3cbed64a2 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -7,6 +7,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -140,6 +142,12 @@ class BaseArithmeticOpsTests(BaseOpsUtil): series_array_exc: type[Exception] | None = TypeError divmod_exc: type[Exception] | None = TypeError + # TODO(infer_string) need to remove import of pyarrow + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -149,6 +157,11 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -158,12 +171,22 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_divmod(self, data): ser = pd.Series(data) self._check_divmod_op(ser, divmod, 1) @@ -179,6 +202,7 @@ def test_divmod_series_array(self, data, data_for_twos): other = pd.Series(other) self._check_divmod_op(other, ops.rdivmod, ser) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_add_series_with_extension_array(self, data): # Check adding an ExtensionArray to a Series of the same dtype matches # the behavior of adding the arrays directly and then wrapping in a diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 135ea67c924d0..fd291908a4f96 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -148,6 +148,7 @@ def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators @@ -159,6 +160,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ) super().test_arith_frame_with_scalar(data, op_name) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index e38144f4c615b..b79b0a98efde4 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import NumpyEADtype import pandas as pd @@ -242,6 +244,7 @@ def test_insert_invalid(self, data, invalid_scalar): frame_scalar_exc = None series_array_exc = None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod(self, data): divmod_exc = None if data.dtype.kind == "O": @@ -249,6 +252,7 @@ def test_divmod(self, data): self.divmod_exc = divmod_exc super().test_divmod(data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod_series_array(self, data): ser = pd.Series(data) exc = None @@ -257,6 +261,7 @@ def test_divmod_series_array(self, data): self.divmod_exc = exc self._check_divmod_op(ser, divmod, data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators series_scalar_exc = None @@ -270,6 +275,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) self.series_scalar_exc = series_scalar_exc super().test_arith_series_with_scalar(data, all_arithmetic_operators) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_array(self, data, all_arithmetic_operators): opname = all_arithmetic_operators series_array_exc = None @@ -278,6 +284,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): self.series_array_exc = series_array_exc super().test_arith_series_with_array(data, all_arithmetic_operators) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators frame_scalar_exc = None diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index ec00044b84c49..795e6b974ca34 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -12,6 +12,7 @@ from pandas._config import using_string_dtype from pandas._libs import iNaT +from pandas.compat import HAS_PYARROW from pandas.errors import ( InvalidIndexError, PerformanceWarning, @@ -1208,7 +1209,9 @@ def test_loc_setitem_datetimelike_with_inference(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index dfbc3b4ca33ad..f0d868a4cb583 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -6,6 +6,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -983,6 +985,9 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): obj.mask(mask, null) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) @given(data=OPTIONAL_ONE_OF_ALL) def test_where_inplace_casting(data): # GH 22051 @@ -1079,6 +1084,9 @@ def test_where_producing_ea_cond_for_np_dtype(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 475632667a87a..4594f725b43d5 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype from pandas.compat import ( + HAS_PYARROW, IS64, PYPY, ) @@ -520,7 +521,7 @@ def test_info_int_columns(): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8d7a0b373f5f8..edba971408d04 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,10 +6,13 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.algos import ( Infinity, NegInfinity, ) +from pandas.compat import HAS_PYARROW from pandas import ( DataFrame, @@ -471,9 +474,18 @@ def test_rank_inf_nans_na_option( ], ) def test_rank_object_first( - self, frame_or_series, na_option, ascending, expected, using_infer_string + self, + request, + frame_or_series, + na_option, + ascending, + expected, + using_infer_string, ): obj = frame_or_series(["foo", "foo", None, "foo"]) + if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) if using_infer_string and isinstance(obj, Series): diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 4136d641ef67f..7670b53f23173 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd import pandas._testing as tm @@ -132,6 +136,9 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 339800538f47b..f6c7bd1f49b27 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -8,6 +8,8 @@ from pandas._config import using_string_dtype from pandas._config.config import option_context +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -113,7 +115,9 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) - @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="surrogates not allowed" + ) def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 0407388d61f51..6b4efc41aeffa 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -13,6 +13,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -1562,7 +1563,9 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c9eb2d5ca7be4..86d9dc0c7fbdc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,6 +24,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -327,7 +328,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index 16ca3a202f1e0..2684704f86b82 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -4,6 +4,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas import ( CategoricalIndex, DataFrame, @@ -96,6 +100,9 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_ops_invalid(self, using_infer_string): # GH#5808 diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index db15461ba0234..a64f06f6c0521 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -226,6 +226,7 @@ def float_frame_with_na(): class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", @@ -431,6 +432,7 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): expected[expected.isna()] = None tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 @@ -534,7 +536,7 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) with pytest.raises( - TypeError, match="unsupported operand type|does not support" + TypeError, match="unsupported operand type|does not support|Cannot perform" ): df.mean() result = df[["A", "C"]].mean() @@ -692,6 +694,7 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted @@ -990,7 +993,7 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - msg = "Could not convert|does not support" + msg = "Could not convert|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) @@ -1103,6 +1106,7 @@ def test_idxmin_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1153,6 +1157,7 @@ def test_idxmax_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1994,7 +1999,7 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation|Cannot perform"): df.sum() diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 6f7453d0d1655..8e1df679ee1b4 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -43,6 +44,11 @@ def test_neg_object(self, df, expected): tm.assert_frame_equal(-df, expected) tm.assert_series_equal(-df["a"], expected["a"]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.parametrize( "df", [ @@ -128,7 +134,9 @@ def test_pos_object(self, df): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) @pytest.mark.parametrize( "df", [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index dcc0b39f0006c..51232fac7d6f6 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas import ( @@ -536,6 +537,9 @@ def names_with_nulls_df(nulls_fixture): ) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, expected_data, expected_index", [ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index dc1658e9acf3b..10f45cac1ff66 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -8,6 +8,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -1743,6 +1744,10 @@ def g(group): tm.assert_series_equal(result, expected) +# TODO harmonize error messages +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) def test_set_group_name(df, grouper, using_infer_string): def f(group): diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9c01e017dd29c..d843a992daee0 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -3,6 +3,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -12,6 +13,9 @@ from pandas.tests.groupby import get_groupby_method_args +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -55,6 +59,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -131,6 +138,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, idx, expected", [ @@ -206,6 +216,9 @@ def test_groupby_dataframe_slice_then_transform(dropna, index): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -287,6 +300,9 @@ def test_groupby_dropna_datetime_like_data( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, data, selected_data, levels", [ @@ -372,6 +388,9 @@ def test_groupby_dropna_with_multiindex_input(input_index, keys, series): tm.assert_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_groupby_nan_included(): # GH 35646 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 493a5be735d1a..add2f3f18b348 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,10 +3,13 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.missing import ( NA, is_matching_na, ) +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -28,6 +31,9 @@ def test_get_indexer_strings(self, method, expected): tm.assert_numpy_array_equal(actual, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_get_indexer_strings_raises(self, using_infer_string): index = Index(["b", "c"]) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 4d02ec853e0da..cf75f95d17b0a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -8,7 +8,12 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas._config import using_string_dtype + +from pandas.compat import ( + HAS_PYARROW, + IS64, +) from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -71,6 +76,9 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) @@ -338,6 +346,11 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.parametrize( "index", [ @@ -856,6 +869,11 @@ def test_isin(self, values, index, expected): result = index.isin(values) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_isin_nan_common_object( self, nulls_fixture, nulls_fixture2, using_infer_string ): diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index a2256322d968b..8d859a61a2bd5 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -9,6 +9,7 @@ from pandas._config import using_string_dtype from pandas._libs.tslibs import Timestamp +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_integer_dtype, @@ -249,6 +250,11 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." not in str(idx) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 34d827a209dae..4d2d1e336ef07 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype from pandas._libs import index as libindex +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1454,6 +1455,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py index b0e4712e8bb3d..d28c7c566d851 100644 --- a/pandas/tests/io/formats/style/test_bar.py +++ b/pandas/tests/io/formats/style/test_bar.py @@ -347,6 +347,7 @@ def test_styler_bar_with_NA_values(): def test_style_bar_with_pyarrow_NA_values(): + pytest.importorskip("pyarrow") data = """name,age,test1,test2,teacher Adam,15,95.0,80,Ashby Bob,16,81.0,82,Ashby diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 6d5f870f07206..90f77a7024235 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -4,6 +4,7 @@ import pytest +from pandas.compat import HAS_PYARROW from pandas.compat._optional import VERSIONS from pandas import ( @@ -117,7 +118,15 @@ def csv1(datapath): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)] +_pyarrow_parsers_only = [ + pytest.param( + _pyarrowParser, + marks=[ + pytest.mark.single_cpu, + pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"), + ], + ) +] _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] @@ -181,7 +190,16 @@ def _get_all_parser_float_precision_combinations(): parser = parser.values[0] for precision in parser.float_precision_choices: # Re-wrap in pytest.param for pyarrow - mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else () + mark = ( + [ + pytest.mark.single_cpu, + pytest.mark.skipif( + not HAS_PYARROW, reason="pyarrow is not installed" + ), + ] + if parser.engine == "pyarrow" + else () + ) param = pytest.param((parser(), precision), marks=mark) params.append(param) ids.append(f"{parser_id}-{precision}") diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index cb9fd9e8da0df..c58db25991510 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -9,6 +9,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( Categorical, @@ -1244,6 +1246,9 @@ def test_idxminmax_object_dtype(self, using_infer_string): with pytest.raises(TypeError, match=msg): ser3.idxmin(skipna=False) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_idxminmax_object_frame(self): # GH#4279 df = DataFrame([["zimm", 2.5], ["biff", 1.0], ["bid", 12.0]]) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index e2c27fe5575db..0b9b1fc080cfe 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import ( np_version_gt2, np_version_gte1p24, @@ -850,6 +851,11 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val tm.assert_series_equal(obj, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_series_where(self, obj, key, expected, warn, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 29d6e2036476e..e53cd753a4192 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -4,6 +4,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -167,6 +171,9 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_inspect_getmembers(self): # GH38782 pytest.importorskip("jinja2") diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 197ef47759bf3..d3a718be47c18 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -6,6 +6,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, Index, @@ -148,6 +150,9 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_operators_int_dtype_with_object(self, using_infer_string): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 76353ab25fca6..fcae835e4c3e2 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import Series import pandas._testing as tm @@ -163,6 +167,9 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) @@ -183,6 +190,9 @@ def test_mean_with_convertible_string_raises(using_array_manager, using_infer_st df.mean() +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_mean_dont_convert_j_to_complex(using_array_manager): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) @@ -204,6 +214,9 @@ def test_mean_dont_convert_j_to_complex(using_array_manager): np.mean(df["db"].astype("string").array) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f353a7fa2f0fe..acf636616421f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -6,7 +6,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( + HAS_PYARROW, IS64, is_platform_arm, is_platform_power, @@ -1420,6 +1423,9 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} From 81850c80b3289dfdcedd656fe99fb4ea086db548 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 9 Aug 2024 10:18:41 -0700 Subject: [PATCH 18/52] REF (string dtype): de-duplicate _str_map (2) (#59451) * REF (string): de-duplicate _str_map (2) * mypy fixup --- pandas/core/arrays/string_.py | 179 +++++++++++++++-------------- pandas/core/arrays/string_arrow.py | 56 +-------- 2 files changed, 92 insertions(+), 143 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0929791ded58c..1919fdce12f11 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -342,6 +342,57 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics( + f, na_value=na_value, dtype=dtype, convert=convert + ) + + from pandas.arrays import BooleanArray + + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray | BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + elif dtype == np.dtype("bool"): + # GH#55736 + na_value = bool(na_value) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(cast(type, dtype)), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + def _str_map_str_or_object( self, dtype, @@ -373,6 +424,45 @@ def _str_map_str_or_object( # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) + def _str_map_nan_semantics( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -727,95 +817,6 @@ def _cmp_method(self, other, op): # base class "NumpyExtensionArray" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - convert = convert and not np.all(mask) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - na_value_is_na = isna(na_value) - if na_value_is_na: - if is_integer_dtype(dtype): - na_value = 0 - else: - na_value = True - - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - if na_value_is_na and mask.any(): - if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") - result[mask] = np.nan - return result - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if self.dtype.na_value is np.nan: - return self._str_map_nan_semantics( - f, na_value=na_value, dtype=dtype, convert=convert - ) - - from pandas.arrays import BooleanArray - - if dtype is None: - dtype = StringDtype(storage="python") - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - elif dtype == np.dtype("bool"): - na_value = bool(na_value) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - class StringArrayNumpySemantics(StringArray): _storage = "python" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 607f6f7e4246a..4bdbf1f6a606f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -283,6 +283,8 @@ def _data(self): # base class "ObjectStringArrayMixin" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] + _str_map = BaseStringArray._str_map + def _str_map_nan_semantics( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): @@ -322,60 +324,6 @@ def _str_map_nan_semantics( else: return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if self.dtype.na_value is np.nan: - return self._str_map_nan_semantics( - f, na_value=na_value, dtype=dtype, convert=convert - ) - - # TODO: de-duplicate with StringArray method. This method is moreless copy and - # paste. - - from pandas.arrays import ( - BooleanArray, - IntegerArray, - ) - - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): From 078c5a01f5310a628ec38191c35150c3b9296a33 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 9 Aug 2024 14:13:06 -0700 Subject: [PATCH 19/52] REF (string): de-duplicate str_map_nan_semantics (#59464) REF: de-duplicate str_map_nan_semantics --- pandas/core/arrays/string_.py | 9 ++++--- pandas/core/arrays/string_arrow.py | 42 ------------------------------ 2 files changed, 5 insertions(+), 46 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1919fdce12f11..f2811703cbecf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -391,7 +391,7 @@ def _str_map( return constructor(result, mask) else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) def _str_map_str_or_object( self, @@ -400,7 +400,6 @@ def _str_map_str_or_object( arr: np.ndarray, f, mask: npt.NDArray[np.bool_], - convert: bool, ): # _str_map helper for case where dtype is either string dtype or object if is_string_dtype(dtype) and not is_object_dtype(dtype): @@ -434,7 +433,6 @@ def _str_map_nan_semantics( mask = isna(self) arr = np.asarray(self) - convert = convert and not np.all(mask) if is_integer_dtype(dtype) or is_bool_dtype(dtype): na_value_is_na = isna(na_value) @@ -453,6 +451,9 @@ def _str_map_nan_semantics( dtype=np.dtype(cast(type, dtype)), ) if na_value_is_na and mask.any(): + # TODO: we could alternatively do this check before map_infer_mask + # and adjust the dtype/na_value we pass there. Which is more + # performant? if is_integer_dtype(dtype): result = result.astype("float64") else: @@ -461,7 +462,7 @@ def _str_map_nan_semantics( return result else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4bdbf1f6a606f..c643d4fed4b20 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -7,7 +7,6 @@ TYPE_CHECKING, Callable, Union, - cast, ) import warnings @@ -24,8 +23,6 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, is_scalar, pandas_dtype, ) @@ -285,45 +282,6 @@ def _data(self): _str_map = BaseStringArray._str_map - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan - else: - na_value = False - - dtype = np.dtype(cast(type, dtype)) - if mask.any(): - # numpy int/bool dtypes cannot hold NaNs so we must convert to - # float64 for int (to match maybe_convert_objects) or - # object for bool (again to match maybe_convert_objects) - if is_integer_dtype(dtype): - dtype = np.dtype("float64") - else: - dtype = np.dtype(object) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=dtype, - ) - return result - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): From fdbd473fa4fd68bf075b7e50c8a4433a42a261ad Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 19:14:08 +0200 Subject: [PATCH 20/52] BUG (string dtype): convert dictionary input to materialized string array in ArrowStringArray constructor (#59479) --- pandas/core/arrays/string_arrow.py | 16 ++++++++++------ pandas/tests/arrays/string_/test_string_arrow.py | 11 +++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c643d4fed4b20..7119f6321a7ff 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -125,18 +125,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of " "large_string type" diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 8d5c16e448cee..6bab04e95de9e 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -88,19 +88,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): ArrowStringArray(arr) -@pytest.mark.xfail( - reason="dict conversion does not seem to be implemented for large string in arrow" -) +@pytest.mark.parametrize("string_type", ["string", "large_string"]) @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_valid_string_type_value_dictionary(chunked): +def test_constructor_valid_string_type_value_dictionary(string_type, chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) arr = ArrowStringArray(arr) - assert pa.types.is_string(arr._pa_array.type.value_type) + # dictionary type get converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) def test_constructor_from_list(): From 2346acf94a618d39cd4b287e1257e860a3bf3572 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 19:33:35 +0200 Subject: [PATCH 21/52] String dtype: fix convert_dtypes() to convert NaN-string to NA-string (#59470) * String dtype: fix convert_dtypes() to convert NaN-string to NA-string * fix CoW tracking for conversion to python storage strings * remove xfails --- pandas/core/dtypes/cast.py | 10 +++++++++- pandas/core/internals/blocks.py | 9 +++++++-- pandas/tests/frame/methods/test_convert_dtypes.py | 10 +--------- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 -- pandas/tests/series/methods/test_convert_dtypes.py | 6 +++--- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9580ab1b520e0..7a92b7306beea 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1025,6 +1025,8 @@ def convert_dtypes( ------- np.dtype, or ExtensionDtype """ + from pandas.core.arrays.string_ import StringDtype + inferred_dtype: str | DtypeObj if ( @@ -1103,12 +1105,18 @@ def convert_dtypes( # If we couldn't do anything else, then we retain the dtype inferred_dtype = input_array.dtype + elif ( + convert_string + and isinstance(input_array.dtype, StringDtype) + and input_array.dtype.na_value is np.nan + ): + inferred_dtype = pandas_dtype_func("string") + else: inferred_dtype = input_array.dtype if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type - from pandas.core.arrays.string_ import StringDtype assert not isinstance(inferred_dtype, str) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2f448bf249a2e..cd1639188b1ad 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -657,8 +657,13 @@ def convert( convert_non_numeric=True, ) refs = None - if copy and res_values is values: - res_values = values.copy() + if ( + copy + and res_values is values + or isinstance(res_values, NumpyExtensionArray) + and res_values._ndarray is values + ): + res_values = res_values.copy() elif res_values is values: refs = self.refs diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 59779234b46d9..e7f6e5d625d3e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,21 +3,15 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm class TestConvertDtypes: - # TODO convert_dtypes should not use NaN variant of string dtype, but always NA - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes( - self, convert_integer, expected, string_storage, using_infer_string - ): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index da17999bba4ca..5138dfb61eaac 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -463,7 +463,6 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -507,7 +506,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 46fed9032c13d..c2cc838619790 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -230,9 +230,9 @@ def test_convert_dtypes( and params[0] and not params[1] ): - # If we would convert with convert strings then infer_objects converts - # with the option - expected_dtype = "string[pyarrow_numpy]" + # If convert_string=False and infer_objects=True, we end up with the + # default string dtype instead of preserving object for string data + expected_dtype = pd.StringDtype(na_value=np.nan) expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) From 1bd3ce8995e91f51f18182cf7d5221fd9942a42b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 13 Aug 2024 00:28:50 +0200 Subject: [PATCH 22/52] String dtype: honor mode.string_storage option (and change default to None) (#59488) * String dtype: honor mode.string_storage option (and change default to None) * fix test + explicitly test default * use 'auto' instead of None --- pandas/core/arrays/string_.py | 12 ++++++++---- pandas/core/config_init.py | 7 +++---- pandas/tests/arrays/string_/test_string_arrow.py | 10 ++++------ pandas/tests/dtypes/test_common.py | 13 +++++++++---- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f2811703cbecf..c881437ba25af 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -136,12 +136,16 @@ def __init__( # infer defaults if storage is None: if na_value is not libmissing.NA: - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") + if storage == "auto": + storage = "python" if storage == "pyarrow_numpy": # TODO raise a deprecation warning diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4cd7e50f0ec50..a1df455eebacf 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -505,13 +505,12 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. """ def is_valid_string_storage(value: Any) -> None: - legal_values = ["python", "pyarrow"] + legal_values = ["auto", "python", "pyarrow"] if value not in legal_values: msg = "Value must be one of python|pyarrow" if value == "pyarrow_numpy": @@ -526,7 +525,7 @@ def is_valid_string_storage(value: Any) -> None: with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + "auto", string_storage_doc, # validator=is_one_of_factory(["python", "pyarrow"]), validator=is_valid_string_storage, diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 6bab04e95de9e..72d672ba8a7d9 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,11 +26,10 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python" and HAS_PYARROW: - # string storage with na_value=NaN always uses pyarrow if available - # -> does not yet honor the option - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) +def test_config(string_storage, using_infer_string): + # with the default string_storage setting + # always "python" at the moment + assert StringDtype().storage == "python" with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e0232bb292d6e..ccd30caba5dee 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance(): def test_pandas_dtype_string_dtypes(string_storage): - # TODO(infer_string) remove skip if "python" is supported - pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype("str") + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") - # TODO(infer_string) hardcoded to pyarrow until python is supported - assert result == pd.StringDtype("pyarrow", na_value=np.nan) + assert result == pd.StringDtype(string_storage, na_value=np.nan) with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): From 7e50b160e2d557ecc5cc20934a893e5d074a67e7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2024 15:46:03 -0700 Subject: [PATCH 23/52] BUG (string): ArrowEA comparisons with mismatched types (#59505) * BUG: ArrowEA comparisons with mismatched types * move whatsnew * GH ref --- pandas/core/arrays/arrow/array.py | 8 ++++++- pandas/core/arrays/string_arrow.py | 6 +---- pandas/tests/series/test_logical_ops.py | 31 +++++++++++++++++++++---- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6c44b7759f0e2..46f2cbb2ebeef 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -704,7 +704,13 @@ def _cmp_method(self, other, op): if isinstance( other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): - result = pc_func(self._pa_array, self._box_pa(other)) + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except pa.ArrowNotImplementedError: + # TODO: could this be wrong if other is object dtype? + # in which case we need to operate pointwise? + result = ops.invalid_comparison(self, other, op) + result = pa.array(result, type=pa.bool_()) elif is_scalar(other): try: result = pc_func(self._pa_array, self._box_pa(other)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7119f6321a7ff..00d92958cc8dc 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -37,7 +37,6 @@ BaseStringArray, StringDtype, ) -from pandas.core.ops import invalid_comparison from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under10p1: @@ -565,10 +564,7 @@ def _convert_int_dtype(self, result): return result def _cmp_method(self, other, op): - try: - result = super()._cmp_method(other, op) - except pa.ArrowNotImplementedError: - return invalid_comparison(self, other, op) + result = super()._cmp_method(other, op) if op == operator.ne: return result.to_numpy(np.bool_, na_value=True) else: diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index d3a718be47c18..ff21427b71cf9 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -9,6 +9,7 @@ from pandas.compat import HAS_PYARROW from pandas import ( + ArrowDtype, DataFrame, Index, Series, @@ -539,18 +540,38 @@ def test_int_dtype_different_index_not_bool(self): result = ser1 ^ ser2 tm.assert_series_equal(result, expected) + # TODO: this belongs in comparison tests def test_pyarrow_numpy_string_invalid(self): # GH#56008 - pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow") ser = Series([False, True]) ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") result = ser == ser2 - expected = Series(False, index=ser.index) - tm.assert_series_equal(result, expected) + expected_eq = Series(False, index=ser.index) + tm.assert_series_equal(result, expected_eq) result = ser != ser2 - expected = Series(True, index=ser.index) - tm.assert_series_equal(result, expected) + expected_ne = Series(True, index=ser.index) + tm.assert_series_equal(result, expected_ne) with pytest.raises(TypeError, match="Invalid comparison"): ser > ser2 + + # GH#59505 + ser3 = ser2.astype("string[pyarrow]") + result3_eq = ser3 == ser + tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]")) + result3_ne = ser3 != ser + tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser3 + + ser4 = ser2.astype(ArrowDtype(pa.string())) + result4_eq = ser4 == ser + tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]")) + result4_ne = ser4 != ser + tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser4 From fa14a190a4f0c3fcad50cf214fb52724b4255c6a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 14 Aug 2024 09:08:54 +0200 Subject: [PATCH 24/52] TST (string dtype): clean up construction of expected string arrays (#59481) --- pandas/tests/io/excel/test_readers.py | 48 +++++++------------ pandas/tests/io/json/test_pandas.py | 38 ++++----------- .../io/parser/dtypes/test_dtypes_basic.py | 30 ++++-------- pandas/tests/io/parser/test_read_fwf.py | 31 ++++-------- pandas/tests/io/test_clipboard.py | 30 ++++-------- pandas/tests/io/test_feather.py | 28 ++++------- pandas/tests/io/test_html.py | 36 ++++---------- pandas/tests/io/test_sql.py | 29 +++-------- pandas/tests/io/xml/test_xml.py | 37 +++----------- 9 files changed, 82 insertions(+), 225 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 1b79b4bff1cea..c899fd01ce7bb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -30,10 +30,6 @@ read_csv, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) if is_platform_windows(): pytestmark = pytest.mark.single_cpu @@ -663,41 +659,31 @@ def test_dtype_backend_and_dtype(self, read_ext): @pytest.mark.xfail( using_string_dtype(), reason="infer_string takes precedence", strict=False ) - def test_dtype_backend_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } + ) + df.to_excel(tmp_excel, sheet_name="test", index=False) with pd.option_context("mode.string_storage", string_storage): - df = DataFrame( - { - "a": np.array(["a", "b"], dtype=np.object_), - "b": np.array(["x", pd.NA], dtype=np.object_), - } + result = pd.read_excel( + tmp_excel, sheet_name="test", dtype_backend="numpy_nullable" ) - with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, sheet_name="test", index=False) - result = pd.read_excel( - file_path, sheet_name="test", dtype_backend="numpy_nullable" - ) - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), + } + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index cb94111aedffd..c0ecd4fd7cf59 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -31,11 +31,6 @@ read_json, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.json import ujson_dumps @@ -2037,14 +2032,10 @@ def test_json_uint64(self): assert result == expected @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize( - "orient", ["split", "records", "values", "index", "columns"] - ) def test_read_json_dtype_backend( self, string_storage, dtype_backend, orient, using_infer_string ): # GH#50750 - pa = pytest.importorskip("pyarrow") df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2058,30 +2049,18 @@ def test_read_json_dtype_backend( } ) - if using_infer_string: - string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None])) - elif string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( StringIO(out), dtype_backend=dtype_backend, orient=orient ) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2090,12 +2069,13 @@ def test_read_json_dtype_backend( "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray expected = DataFrame( diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 5138dfb61eaac..e072e97637f2a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -18,11 +18,7 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - IntegerArray, - StringArray, -) +from pandas.core.arrays import IntegerArray pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" @@ -465,8 +461,6 @@ def test_dtype_backend_and_dtype(all_parsers): def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 - pa = pytest.importorskip("pyarrow") - with pd.option_context("mode.string_storage", string_storage): parser = all_parsers @@ -476,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage): """ result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), + } + ) + tm.assert_frame_equal(result, expected) def test_dtype_backend_ea_dtype_specified(all_parsers): diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 53426bebaa70b..72a341d8487e7 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import EmptyDataError import pandas as pd @@ -24,10 +22,6 @@ DatetimeIndex, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import urlopen from pandas.io.parsers import ( @@ -968,39 +962,30 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(string_storage, dtype_backend): # GH#50289 - if string_storage == "python": - arr = StringArray(np.array(["a", "b"], dtype=np.object_)) - arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - arr = ArrowExtensionArray(pa.array(["a", "b"])) - arr_na = ArrowExtensionArray(pa.array([None, "a"])) - else: - pa = pytest.importorskip("pyarrow") - arr = ArrowStringArray(pa.array(["a", "b"])) - arr_na = ArrowStringArray(pa.array([None, "a"])) - data = """a b c d e f g h i 1 2.5 True a 3 4.5 False b True 6 7.5 a""" with pd.option_context("mode.string_storage", string_storage): result = read_fwf(StringIO(data), dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": pd.Series([1, 3], dtype="Int64"), "b": pd.Series([2.5, 4.5], dtype="Float64"), "c": pd.Series([True, False], dtype="boolean"), - "d": arr, + "d": pd.Series(["a", "b"], dtype=string_dtype), "e": pd.Series([pd.NA, True], dtype="boolean"), "f": pd.Series([pd.NA, 6], dtype="Int64"), "g": pd.Series([pd.NA, 7.5], dtype="Float64"), - "h": arr_na, + "h": pd.Series([None, "a"], dtype=string_dtype), "i": pd.Series([pd.NA, pd.NA], dtype="Int64"), } ) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index da998f058471c..3a52ff5acc0b3 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -19,10 +19,6 @@ read_clipboard, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.clipboard import ( CheckedCall, @@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend( self, clipboard, string_storage, dtype_backend, engine ): # GH#50502 - if string_storage == "pyarrow" or dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - - if string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow" and engine != "c": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + if engine == "c" and string_storage == "pyarrow": + # TODO avoid this exception? + string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) else: - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) + string_dtype = pd.StringDtype(string_storage) text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False @@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend( expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index d169fab3f1832..d1201686edefa 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -6,10 +6,6 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.feather_format import read_feather, to_feather # isort:skip @@ -188,25 +184,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - with tm.ensure_clean() as path: to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): result = read_feather(path, dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), @@ -215,8 +203,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), "e": pd.Series([True, False, pd.NA], dtype="boolean"), "f": pd.Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": pd.Series(["a", "b", "c"], dtype=string_dtype), + "h": pd.Series(["a", "b", None], dtype=string_dtype), } ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 298b9115b51e4..56c9cf8df0e89 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -31,17 +29,9 @@ to_datetime, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import file_path_to_url -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture( params=[ @@ -160,7 +150,7 @@ def test_to_html_compat(self, flavor_read_html): df = ( DataFrame( np.random.default_rng(2).random((4, 3)), - columns=pd.Index(list("abc"), dtype=object), + columns=pd.Index(list("abc")), ) # pylint: disable-next=consider-using-f-string .map("{:.3f}".format).astype(float) @@ -186,24 +176,16 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -212,8 +194,8 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 792c532fa8032..b1557d71f15e4 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -42,10 +42,6 @@ to_timedelta, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.util.version import Version from pandas.io import sql @@ -3678,24 +3674,13 @@ def dtype_backend_data() -> DataFrame: @pytest.fixture def dtype_backend_expected(): - def func(storage, dtype_backend, conn_name) -> DataFrame: - string_array: StringArray | ArrowStringArray - string_array_na: StringArray | ArrowStringArray - if storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": + def func(string_storage, dtype_backend, conn_name) -> DataFrame: + string_dtype: pd.StringDtype | pd.ArrowDtype + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + string_dtype = pd.StringDtype(string_storage) df = DataFrame( { @@ -3705,8 +3690,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, pd.NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 35beda37acf51..04c040efa767c 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -30,11 +28,6 @@ Series, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -2007,7 +2000,6 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): tm.assert_frame_equal(df_lxml, df_etree) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_xml_nullable_dtypes( parser, string_storage, dtype_backend, using_infer_string ): @@ -2038,36 +2030,21 @@ def test_read_xml_nullable_dtypes( """ - if using_infer_string: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) - - elif string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + with pd.option_context("mode.string_storage", string_storage): + result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) - elif dtype_backend == "pyarrow": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) - - with pd.option_context("mode.string_storage", string_storage): - result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) + string_dtype = pd.StringDtype(string_storage) expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), From 036e9da72ee3b0bac1e423069ac4c2efc6370983 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 15:07:41 -0400 Subject: [PATCH 25/52] TST (string dtype): clean up construction of expected string arrays (#59481) --- pandas/tests/io/excel/test_readers.py | 39 ++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c899fd01ce7bb..8dc76d8f747cb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -659,31 +659,34 @@ def test_dtype_backend_and_dtype(self, read_ext): @pytest.mark.xfail( using_string_dtype(), reason="infer_string takes precedence", strict=False ) - def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): + def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - df = DataFrame( - { - "a": np.array(["a", "b"], dtype=np.object_), - "b": np.array(["x", pd.NA], dtype=np.object_), - } - ) - df.to_excel(tmp_excel, sheet_name="test", index=False) - with pd.option_context("mode.string_storage", string_storage): - result = pd.read_excel( - tmp_excel, sheet_name="test", dtype_backend="numpy_nullable" + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } ) - expected = DataFrame( - { - "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), - "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), - } - ) - tm.assert_frame_equal(result, expected) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, sheet_name="test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", dtype_backend="numpy_nullable" + ) + + expected = DataFrame( + { + "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), + } + ) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): From 4d26bed018510e64da0ea65abd8d47727a665d46 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 15:08:52 -0400 Subject: [PATCH 26/52] TST (string dtype): fix IO dtype_backend tests for storage of str dtype of columns' Index (#59509) --- pandas/tests/io/json/test_pandas.py | 4 +++- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 12 ++++++------ pandas/tests/io/parser/test_read_fwf.py | 4 +++- pandas/tests/io/test_html.py | 4 +++- pandas/tests/io/xml/test_xml.py | 4 +++- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c0ecd4fd7cf59..de40441fe25dd 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2088,7 +2088,9 @@ def test_read_json_dtype_backend( if orient == "values": expected.columns = list(range(8)) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("orient", ["split", "records", "index"]) def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e072e97637f2a..800ece5a409e1 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -470,12 +470,12 @@ def test_dtype_backend_string(all_parsers, string_storage): """ result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - expected = DataFrame( - { - "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), - "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), - } - ) + expected = DataFrame( + { + "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), + }, + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 72a341d8487e7..d8fe168341ff1 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -1001,7 +1001,9 @@ def test_dtype_backend(string_storage, dtype_backend): ) expected["i"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 56c9cf8df0e89..826c0a1ca7cf9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -211,7 +211,9 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.network @pytest.mark.single_cpu diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 04c040efa767c..92e89ddbc8e80 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -2065,7 +2065,9 @@ def test_read_xml_nullable_dtypes( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): From 31153c1e158676fe47a4379782811629e64a3042 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2024 12:44:56 -0700 Subject: [PATCH 27/52] REF (string): Move StringArrayNumpySemantics methods to base class (#59514) * REF (string): Move StringArrayNumpySemantics methods to base class * mypy fixup --- pandas/core/arrays/string_.py | 55 +++++++++++++++-------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c881437ba25af..f3e5e6fe5f3da 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -738,11 +738,23 @@ def astype(self, dtype, copy: bool = True): def _reduce( self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + if name in ["min", "max"]: return getattr(self, name)(skipna=skipna, axis=axis) raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + if self.dtype.na_value is np.nan and result is libmissing.NA: + # the masked_reductions use pd.NA -> convert to np.nan + return np.nan + return super()._wrap_reduction_result(axis, result) + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) result = masked_reductions.min( @@ -761,7 +773,11 @@ def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, sort=False, dropna=dropna) result.index = result.index.astype(self.dtype) + + if self.dtype.na_value is libmissing.NA: + result = result.astype("Int64") return result def memory_usage(self, deep: bool = False) -> int: @@ -812,7 +828,13 @@ def _cmp_method(self, other, op): # logical result = np.zeros(len(self._ndarray), dtype="bool") result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) + res_arr = BooleanArray(result, mask) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return res_arr.to_numpy(np.bool_, na_value=True) + else: + return res_arr.to_numpy(np.bool_, na_value=False) + return res_arr _arith_method = _cmp_method @@ -853,37 +875,6 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if name == "any": - return nanops.nanany(self._ndarray, skipna=skipna) - else: - return nanops.nanall(self._ndarray, skipna=skipna) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) - - def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: - # the masked_reductions use pd.NA - if result is libmissing.NA: - return np.nan - return super()._wrap_reduction_result(axis, result) - - def _cmp_method(self, other, op): - result = super()._cmp_method(other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas.core.algorithms import value_counts_internal as value_counts - - result = value_counts(self._ndarray, sort=False, dropna=dropna) - result.index = result.index.astype(self.dtype) - return result - # ------------------------------------------------------------------------ # String methods interface _str_na_value = np.nan From 721bf1ef869961ec64d0b533a212768caf851177 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2024 10:45:12 -0700 Subject: [PATCH 28/52] REF (string): remove _str_na_value (#59515) * REF (string): remove _str_na_value * mypy fixup --- pandas/core/arrays/numpy_.py | 4 ---- pandas/core/arrays/string_.py | 10 ---------- pandas/core/arrays/string_arrow.py | 4 ---- pandas/core/strings/object_array.py | 10 ++++------ 4 files changed, 4 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 07eb91e0cb13b..03712f75db0c7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -557,7 +557,3 @@ def _wrap_ndarray_result(self, result: np.ndarray): return TimedeltaArray._simple_new(result, dtype=result.dtype) return type(self)(result) - - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f3e5e6fe5f3da..a64be0b197494 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -838,12 +838,6 @@ def _cmp_method(self, other, op): _arith_method = _cmp_method - # ------------------------------------------------------------------------ - # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "NumpyExtensionArray" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - class StringArrayNumpySemantics(StringArray): _storage = "python" @@ -874,7 +868,3 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # need to override NumpyExtensionArray._from_backing_data to ensure # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) - - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 00d92958cc8dc..136411ad1a83d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -279,10 +279,6 @@ def _data(self): # ------------------------------------------------------------------------ # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "ObjectStringArrayMixin" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - _str_map = BaseStringArray._str_map def _str_contains( diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0029beccc40a8..090e27ec58cc3 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -37,8 +37,6 @@ class ObjectStringArrayMixin(BaseStringArrayMethods): String Methods operating on object-dtype ndarrays. """ - _str_na_value = np.nan - def __len__(self) -> int: # For typing, _str_map relies on the object being sized. raise NotImplementedError @@ -56,7 +54,7 @@ def _str_map( na_value : Scalar, optional The value to set for NA values. Might also be used for the fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` + This defaults to ``self.dtype.na_value`` which is ``np.nan`` for object-dtype and Categorical and ``pd.NA`` for StringArray. dtype : Dtype, optional The dtype of the result array. @@ -66,7 +64,7 @@ def _str_map( if dtype is None: dtype = np.dtype("object") if na_value is None: - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): return np.array([], dtype=dtype) @@ -270,7 +268,7 @@ def f(x): return x.get(i) elif len(x) > i >= -len(x): return x[i] - return self._str_na_value + return self.dtype.na_value # type: ignore[attr-defined] return self._str_map(f) @@ -473,7 +471,7 @@ def _str_removesuffix(self, suffix: str) -> Series: def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): regex = re.compile(pat, flags=flags) - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not expand: From ceee52d0ca662f4fa484d0943daeda6115f0e524 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2024 12:01:21 -0700 Subject: [PATCH 29/52] REF (string): move ArrowStringArrayNumpySemantics methods to base class (#59501) * REF: move ArrowStringArrayNumpySemantics methods to parent class * REF: move methods to ArrowStringArray * mypy fixup * Fix incorrect double-unpacking * move methods to subclass --- pandas/core/arrays/string_arrow.py | 109 +++++++++++++---------------- 1 file changed, 48 insertions(+), 61 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 136411ad1a83d..91c1f20ba93c6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from functools import partial import operator import re from typing import ( @@ -209,12 +208,17 @@ def dtype(self) -> StringDtype: # type: ignore[override] return self._dtype def insert(self, loc: int, item) -> ArrowStringArray: + if self.dtype.na_value is np.nan and item is np.nan: + item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - @classmethod - def _result_converter(cls, values, na=None): + def _result_converter(self, values, na=None): + if self.dtype.na_value is np.nan: + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -494,11 +498,30 @@ def _str_get_dummies(self, sep: str = "|"): return dummies.astype(np.int64, copy=False), labels def _convert_int_dtype(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + return Int64Dtype().__from_arrow__(result) def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if not skipna: + nas = pc.is_null(self._pa_array) + arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_dtype(result) @@ -529,67 +552,31 @@ def _rank( ) ) - -class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow" - _na_value = np.nan - - @classmethod - def _result_converter(cls, values, na=None): - if not isna(na): - values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) - - def __getattribute__(self, item): - # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item not in ( - "_pa_array", - "__dict__", - ): - return partial(getattr(ArrowStringArrayMixin, item), self) - return super().__getattribute__(item) - - def _convert_int_dtype(self, result): - if isinstance(result, pa.Array): - result = result.to_numpy(zero_copy_only=False) - else: - result = result.to_numpy() - if result.dtype == np.int32: - result = result.astype(np.int64) + def value_counts(self, dropna: bool = True) -> Series: + result = super().value_counts(dropna=dropna) + if self.dtype.na_value is np.nan: + res_values = result._values.to_numpy() + return result._constructor( + res_values, index=result.index, name=result.name, copy=False + ) return result def _cmp_method(self, other, op): result = super()._cmp_method(other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas import Series - - result = super().value_counts(dropna) - return Series( - result._values.to_numpy(), index=result.index, name=result.name, copy=False - ) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if not skipna and name == "all": - nas = pc.invert(pc.is_null(self._pa_array)) - arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) else: - arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( - name, skipna=skipna, keepdims=keepdims, **kwargs - ) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + return result.to_numpy(np.bool_, na_value=False) + return result - def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: - if item is np.nan: - item = libmissing.NA - return super().insert(loc, item) # type: ignore[return-value] + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _na_value = np.nan + _str_get = ArrowStringArrayMixin._str_get + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_pad = ArrowStringArrayMixin._str_pad + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace From 38f5b61939dc9333163640980250c71da3bf4128 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 16 Aug 2024 10:29:26 -0700 Subject: [PATCH 30/52] API (string): return str dtype for .dt methods, DatetimeIndex methods (#59526) * API (string): return str dtype for .dt methods, DatetimeIndex methods * mypy fixup --- pandas/core/arrays/datetimelike.py | 6 +++++ pandas/core/arrays/datetimes.py | 17 +++++++++++++ pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/extension.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 24 ++++++++++++------- .../series/accessors/test_dt_accessor.py | 8 +++---- 6 files changed, 45 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1042a1b3fde61..e85c0222bbec3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -20,6 +20,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( algos, lib, @@ -1789,6 +1791,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='object') """ result = self._format_native_types(date_format=date_format, na_rep=np.nan) + if using_string_dtype(): + from pandas import StringDtype + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a146220d249e2..0db25db02e75a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -14,6 +14,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( lib, tslib, @@ -1306,6 +1308,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: values, "month_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1363,6 +1372,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: values, "day_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + # TODO: no tests that check for dtype of result as of 2024-08-15 + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result @property diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c978abd8c2427..3204a9c97ee73 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -276,7 +276,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: @doc(DatetimeArray.strftime) def strftime(self, date_format) -> Index: arr = self._data.strftime(date_format) - return Index(arr, name=self.name, dtype=object) + return Index(arr, name=self.name, dtype=arr.dtype) @doc(DatetimeArray.tz_convert) def tz_convert(self, tz) -> Self: diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 61949531f37df..371d3c811e772 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -71,7 +71,7 @@ def fget(self): return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result def fset(self, value) -> None: @@ -98,7 +98,7 @@ def method(self, *args, **kwargs): # type: ignore[misc] return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result # error: "property" has no attribute "__name__" diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 360ab960088ed..b346294a892d3 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -889,20 +889,24 @@ def test_concat_same_type_different_freq(self, unit): tm.assert_datetime_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y %b") expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) class TestTimedeltaArray(SharedTests): @@ -1159,20 +1163,24 @@ def test_array_interface(self, arr1d): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y") expected = np.array([per.strftime("%Y") for per in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]")) result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 0dd2c227d6aa7..18ee81581bdc3 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -29,6 +29,7 @@ Period, PeriodIndex, Series, + StringDtype, TimedeltaIndex, date_range, period_range, @@ -528,7 +529,6 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime(self): # GH 10086 ser = Series(date_range("20130101", periods=5)) @@ -599,10 +599,9 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("str") + expected = expected.astype(StringDtype(na_value=np.nan)) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_microsecond_resolution(self): ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") @@ -635,7 +634,6 @@ def test_strftime_period_minutes(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data", [ @@ -658,7 +656,7 @@ def test_strftime_all_nat(self, data): ser = Series(data) with tm.assert_produces_warning(None): result = ser.dt.strftime("%Y-%m-%d") - expected = Series([np.nan], dtype=object) + expected = Series([np.nan], dtype="str") tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): From a35481f91f790c427e3a1dd7769399a90aca23a2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Aug 2024 14:23:07 -0400 Subject: [PATCH 31/52] Pick required fix from 2542674ee9 #56709 --- pandas/tests/computation/test_eval.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index e8fad6b8cbd63..cf3e50094ac97 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -606,11 +606,10 @@ def test_unary_in_array(self): ) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("expr", ["x < -0.1", "-5 > x"]) - def test_float_comparison_bin_op(self, dtype, expr): + def test_float_comparison_bin_op(self, float_numpy_dtype, expr): # GH 16363 - df = DataFrame({"x": np.array([0], dtype=dtype)}) + df = DataFrame({"x": np.array([0], dtype=float_numpy_dtype)}) res = df.eval(expr) assert res.values == np.array([False]) From 172af49331c2fab84b76463bade55978d51d6ad1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 21 Aug 2024 21:59:09 -0400 Subject: [PATCH 32/52] Pick required fix from f4232e7 #58006 --- pandas/tests/frame/test_reductions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index a64f06f6c0521..a4263279a7bd5 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1999,7 +1999,9 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support operation|Cannot perform"): + with pytest.raises( + TypeError, match="does not support (operation|reduction)|Cannot perform" + ): df.sum() From 7946df1d55ebfeeacf0e0a7422d82513ed355585 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:56:11 -0400 Subject: [PATCH 33/52] Pick required fix from #55901 and #59581 --- pandas/tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 72efe989804e4..ae8889faaf1c9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1314,6 +1314,7 @@ def test_empty_dataframe(self, fp): _HAVE_FASTPARQUET and Version(fastparquet.__version__) > Version("2022.12"), reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929", ) + @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") def test_timezone_aware_index(self, fp, timezone_aware_date_list): idx = 5 * [timezone_aware_date_list] From 6909c47cb802fd5190ec48f5b9afff96dd1bce4e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 14:47:08 -0400 Subject: [PATCH 34/52] Remove .pre-commit check for pytest ref #56671 --- .pre-commit-config.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b02ad7cf886f..9b3a9827e67e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -274,13 +274,6 @@ repos: language: python types: [rst] files: ^doc/source/(development|reference)/ - - id: unwanted-patterns-bare-pytest-raises - name: Check for use of bare pytest raises - language: python - entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises" - types: [python] - files: ^pandas/tests/ - exclude: ^pandas/tests/extension/ - id: unwanted-patterns-private-function-across-module name: Check for use of private functions across modules language: python From b70cd487c61952768c0c9c1d5e2cb532f18ca20c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 15:48:34 -0400 Subject: [PATCH 35/52] Skip niche issue --- pandas/tests/strings/test_find_replace.py | 24 +++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 2d7c9754ee319..78ce1d7418886 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -233,14 +233,22 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype.na_value is np.nan: - expected = Series([True, True, True], dtype=np.bool_) - else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + # this particular combination of events is broken on 2.3 + # would require cherry picking #58483, which in turn requires #57481 + # which introduce many behavioral changes + if not ( + hasattr(any_string_dtype, "storage") + and any_string_dtype.storage == "python" + and any_string_dtype.na_value is np.nan + ): + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype.na_value is np.nan: + expected = Series([True, True, True], dtype=np.bool_) + else: + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = ( From 1718e4b80f8bc9161a09ab95874763353088d4a4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Aug 2024 15:17:38 -0400 Subject: [PATCH 36/52] Add required skip from #58467 --- pandas/tests/io/test_parquet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ae8889faaf1c9..59662ec77d52f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -973,6 +973,8 @@ def test_timestamp_nanoseconds(self, pa): check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): + pytest.importorskip("pyarrow", "11.0.0") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: request.applymarker( pytest.mark.xfail( From 3467d26492363b3004f510cee55fd20dc291c239 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:52:24 -0400 Subject: [PATCH 37/52] Remove tests that will fail without backport of #58437 --- pandas/tests/frame/test_query_eval.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 13232a0909c5b..7dde0683aa960 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -190,6 +190,25 @@ def test_eval_object_dtype_binop(self): expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]}) tm.assert_frame_equal(res, expected) + def test_extension_array_eval(self, engine, parser, request): + # GH#58748 + if engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr does not support extension array dtypes" + ) + request.applymarker(mark) + df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])}) + result = df.eval("a / b", engine=engine, parser=parser) + expected = Series(pd.array([0.25, 0.40, 0.50])) + tm.assert_series_equal(result, expected) + + def test_complex_eval(self, engine, parser): + # GH#21374 + df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]}) + result = df.eval("a/b", engine=engine, parser=parser) + expected = Series([1.5 + 0.5j]) + tm.assert_series_equal(result, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): From 9142e5e15ce393e89f4ff4cab84ef65247303e9c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Sep 2024 23:32:38 +0200 Subject: [PATCH 38/52] additional test fixes (for tests that changed or no longer exist on main) --- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/methods/test_align.py | 3 +++ pandas/tests/frame/methods/test_quantile.py | 3 --- pandas/tests/frame/methods/test_to_csv.py | 1 + pandas/tests/frame/methods/test_update.py | 2 +- pandas/tests/groupby/test_apply.py | 11 +++++++---- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_raises.py | 2 +- pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/indexing/test_indexing.py | 15 ++++++++++++--- .../io/parser/common/test_file_buffer_url.py | 4 ++-- pandas/tests/io/test_common.py | 4 +++- pandas/tests/io/test_pickle.py | 10 ++++++++-- pandas/tests/io/test_stata.py | 4 ++-- pandas/tests/tools/test_to_datetime.py | 6 ++---- 15 files changed, 45 insertions(+), 26 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 795e6b974ca34..2ab518405503c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -519,7 +519,7 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 5a9c47866dae8..15a97a99caa5a 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -155,6 +157,7 @@ def test_align_series_condition(self): expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_align_int(self, int_frame): # test other non-float types other = DataFrame(index=range(5), columns=["A", "B", "C"]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index ec070467b242e..15af2a14a042e 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -354,7 +354,6 @@ def test_quantile_multi_empty(self, interp_method): ) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_datetime(self, unit): dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) df = DataFrame({"a": dti, "b": [0, 5]}) @@ -408,7 +407,6 @@ def test_quantile_datetime(self, unit): expected = DataFrame(index=[0.5], columns=[]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [ @@ -679,7 +677,6 @@ def test_quantile_nat(self, interp_method, request, using_array_manager, unit): ) tm.assert_frame_equal(res, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_empty_no_rows_floats(self, interp_method): interpolation, method = interp_method diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 20a8e95f990ec..4a65c3929944b 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -35,6 +35,7 @@ def read_csv(self, path, **kwargs): return read_csv(path, **params) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_from_csv1(self, float_frame, datetime_frame): with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 8af1798aa8e00..56700ab6bd1f7 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -169,7 +169,7 @@ def test_update_with_different_dtype(self, using_copy_on_write): { "a": [1, 3], "b": [np.nan, 2], - "c": Series(["foo", np.nan], dtype="object"), + "c": Series(["foo", np.nan]), } ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d91510d834e6c..cc736f2bf53ba 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -126,7 +128,7 @@ def test_apply_trivial(using_infer_string): {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" @@ -143,7 +145,7 @@ def test_apply_trivial_fail(using_infer_string): {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1299,12 +1301,13 @@ def test_apply_dropna_with_indexed_same(dropna): @pytest.mark.parametrize( "as_index, expected", [ - [ + pytest.param( False, DataFrame( [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object) ), - ], + marks=pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)"), + ), [ True, Series( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 10f45cac1ff66..015a9db32883b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1005,7 +1005,7 @@ def test_raises_on_nuisance(df): depr_msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=depr_msg): grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = "does not support reduction 'sum'" + msg = "does not support reduction 'sum'|Cannot perform reduction 'sum'" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 64780d0ba03d8..d5b7a3f25d0eb 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -219,7 +219,7 @@ def func(x): getattr(gb, how)(func) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index c0a62ecb06f56..ac3bfe3a13a44 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -834,6 +834,7 @@ def replacer(self, how, from_key, to_key): def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -854,7 +855,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key msg = "Downcasting behavior in `replace`" warn = FutureWarning diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 908e95accfb0f..e57598cfc2be1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -687,7 +687,7 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - def test_rhs_alignment(self): + def test_rhs_alignment(self, using_infer_string): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases def run_tests(df, rhs, right_loc, right_iloc): @@ -731,8 +731,17 @@ def run_tests(df, rhs, right_loc, right_iloc): frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}") right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - run_tests(df, rhs, right_loc, right_iloc) + if using_infer_string: + with pytest.raises( + TypeError, match="Must provide strings|Scalar must be NA or str" + ): + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype" + ): + run_tests(df, rhs, right_loc, right_iloc) + else: + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + run_tests(df, rhs, right_loc, right_iloc) @pytest.mark.parametrize( "idx", [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)] diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index 1f5021c8a24cc..c13b77f365496 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -87,8 +87,8 @@ def test_path_local_path(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_localpath( df.to_csv, lambda p: parser.read_csv(p, index_col=0) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 75ecd1d929d58..d38f716cf6a98 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -308,10 +308,12 @@ def test_read_expands_user_home_dir( "pyarrow", ("io", "data", "feather", "feather-0_3_1.feather"), ), - ( + pytest.param( pd.read_hdf, "tables", ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + # cleaned-up in https://github.com/pandas-dev/pandas/pull/57387 on main + marks=pytest.mark.xfail(reason="TODO(infer_string)", strict=False), ), (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 4f3993a038197..05f4a20ee42d8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -413,10 +413,16 @@ def test_read(self, protocol, get_random_path): @pytest.mark.parametrize( ["pickle_file", "excols"], [ - ("test_py27.pkl", Index(["a", "b", "c"])), + ("test_py27.pkl", Index(["a", "b", "c"], dtype=object)), ( "test_mi_py27.pkl", - pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), + pd.MultiIndex( + [ + Index(["a", "b", "c"], dtype=object), + Index(["A", "B", "C"], dtype=object), + ], + [np.array([0, 1, 2]), np.array([0, 1, 2])], + ), ), ], ) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3c5e843e2e97b..09509fb495034 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1563,8 +1563,8 @@ def test_path_pathlib(self): def test_pickle_path_localpath(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ede38ce9c9a09..e6ae3c1a39bd0 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1207,10 +1207,8 @@ def test_out_of_bounds_errors_ignore2(self): # GH#12424 msg = "errors='ignore' is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - res = to_datetime( - Series(["2362-01-01", np.nan], dtype=object), errors="ignore" - ) - exp = Series(["2362-01-01", np.nan], dtype=object) + res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + exp = Series(["2362-01-01", np.nan]) tm.assert_series_equal(res, exp) def test_to_datetime_tz(self, cache): From b61bd23a6e5b6b1c233c2a950d76ad5d7c87f8ff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 21:31:40 +0200 Subject: [PATCH 39/52] String dtype: still return nullable NA-variant in object inference (`maybe_converts_object`) if requested (#59487) * String dtype: maybe_converts_object give precedence to nullable dtype * update datetimelike input validation * update tests and remove xfails * explicitly test pd.array() behaviour (remove xfail) * fixup allow_2d * undo changes related to datetimelike input validation * fix test for str on current main --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .../tests/arrays/string_/test_string_arrow.py | 5 +-- pandas/tests/arrays/test_array.py | 40 ++++++++++++++++++- pandas/tests/arrays/test_datetimelike.py | 7 +--- pandas/tests/base/test_value_counts.py | 4 +- .../dtypes/cast/test_construct_ndarray.py | 2 +- .../io/parser/usecols/test_usecols_basic.py | 3 -- 6 files changed, 46 insertions(+), 15 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 72d672ba8a7d9..e6957feecf4b5 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string): result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype( - string_storage, na_value=np.nan if using_infer_string else pd.NA - ) + # pd.array(..) by default always returns the NA-variant + dtype = StringDtype(string_storage, na_value=pd.NA) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 76b42b643ee69..158a963845b06 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -218,6 +218,15 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + "str", + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)) + if using_string_dtype() + else NumpyExtensionArray(np.array(["a", "None"])), + ), ( ["a", None], pd.StringDtype(), @@ -225,6 +234,29 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + ), # Boolean ( [True, None], @@ -277,7 +309,6 @@ def test_array_copy(): cet = pytz.timezone("CET") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ @@ -370,6 +401,13 @@ def test_array_copy(): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index b346294a892d3..ede81264cb415 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -295,9 +295,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings( - self, arr1d, box, string_storage, using_infer_string - ): + def test_searchsorted_castable_strings(self, arr1d, box, string_storage): arr = arr1d if box is None: pass @@ -333,8 +331,7 @@ def test_searchsorted_castable_strings( TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got " - f"{'str' if using_infer_string else 'string'} array instead." + "or array of those. Got string array instead." ), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 2729666398877..1f643f24ed5f7 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -127,7 +127,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string): else: exp = np.unique(np.array(s_values, dtype=np.object_)) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 @@ -205,7 +205,7 @@ def test_value_counts_bins(index_or_series, using_infer_string): else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index ab468c81124bc..6b9b2dfda6e8b 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na( ): result = sanitize_array(values, index=None, dtype=dtype) if using_infer_string and expected.dtype == object and dtype is None: - tm.assert_extension_array_equal(result, pd.array(expected)) + tm.assert_extension_array_equal(result, pd.array(expected, dtype="str")) else: tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 24937de163662..767fba666e417 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError from pandas import ( @@ -547,7 +545,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ From c3d398032caa4f2136d99d751daf072255aa004d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Sep 2024 13:36:31 +0200 Subject: [PATCH 40/52] Enable CoW in the string test build --- .github/workflows/unit-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index eef899173403b..bd7da3a804634 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -89,9 +89,11 @@ jobs: - name: "Future infer strings" env_file: actions-312.yaml pandas_future_infer_string: "1" + pandas_copy_on_write: "1" - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" + pandas_copy_on_write: "1" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" From e3728c7502c932c75b868847abfda1a9a230ab9f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 23 Sep 2024 20:27:32 -0400 Subject: [PATCH 41/52] Skip test if pyarrow not installed in test_numeric_only --- pandas/tests/groupby/test_numeric_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 029d322e4fdc3..b1fa541d42086 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -275,7 +275,7 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str "has no kernel", ) if using_infer_string: - import pyarrow as pa + pa = pytest.importorskip("pyarrow") errs = (TypeError, pa.lib.ArrowNotImplementedError) else: From 732aa90af5927f097d1109456df063815ae1932e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:39:05 -0400 Subject: [PATCH 42/52] pick out stringarray keepdims changes from #59234 --- pandas/core/arrays/string_.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a64be0b197494..1aa6fb70d250c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -736,7 +736,13 @@ def astype(self, dtype, copy: bool = True): return super().astype(dtype, copy) def _reduce( - self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + self, + name: str, + *, + skipna: bool = True, + keepdims: bool = False, + axis: AxisInt | None = 0, + **kwargs, ): if self.dtype.na_value is np.nan and name in ["any", "all"]: if name == "any": @@ -745,8 +751,10 @@ def _reduce( return nanops.nanall(self._ndarray, skipna=skipna) if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, axis=axis) - + result = getattr(self, name)(skipna=skipna, axis=axis) + if keepdims: + return self._from_sequence([result], dtype=self.dtype) + return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: From 66e26d1410ea29b6ca6f67d9e445d14cb256714e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 15:33:55 +0200 Subject: [PATCH 43/52] Fix: avoid object dtype inference warning in to_datetime --- pandas/core/tools/datetimes.py | 5 +++++ pandas/tests/tools/test_to_datetime.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 05262c235568d..8f700cfa63132 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -16,6 +16,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( lib, tslib, @@ -476,6 +478,9 @@ def _array_strptime_with_fallback( unit = np.datetime_data(result.dtype)[0] res = Index(result, dtype=f"M8[{unit}, UTC]", name=name) return res + elif using_string_dtype() and result.dtype == object: + if lib.is_string_array(result): + return Index(result, dtype="str", name=name) return Index(result, dtype=result.dtype, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index e6ae3c1a39bd0..e7e8f3ac63cd1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1492,7 +1492,9 @@ def test_datetime_invalid_index(self, values, format): warn, match="Could not infer format", raise_on_extra_warnings=False ): res = to_datetime(values, errors="ignore", format=format) - tm.assert_index_equal(res, Index(values, dtype=object)) + tm.assert_index_equal( + res, Index(values, dtype="object" if format is None else "str") + ) with tm.assert_produces_warning( warn, match="Could not infer format", raise_on_extra_warnings=False @@ -3713,7 +3715,7 @@ def test_to_datetime_mixed_not_necessarily_iso8601_raise(): ("errors", "expected"), [ ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])), - ("ignore", Index(["2020-01-01", "01-01-2000"], dtype=object)), + ("ignore", Index(["2020-01-01", "01-01-2000"], dtype="str")), ], ) def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): From e9806c1216ffb1b880d59ce061dd72829f6053e2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 15:43:57 +0200 Subject: [PATCH 44/52] xfail tests that trigger dtype inference warnings --- pandas/tests/frame/indexing/test_indexing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 2ab518405503c..7a7586961deca 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -906,6 +906,8 @@ def test_setitem_frame_float(self, float_frame): expected = piece.values tm.assert_almost_equal(result, expected) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed(self, float_string_frame): # GH 3216 @@ -918,6 +920,8 @@ def test_setitem_frame_mixed(self, float_string_frame): f.loc[key] = piece tm.assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame): # GH#3216 rows unaligned f = float_string_frame.copy() @@ -932,6 +936,8 @@ def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame): f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2] ) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed_key_unaligned(self, float_string_frame): # GH#3216 key is unaligned with values f = float_string_frame.copy() From db9aa774e0d355445dc2919b50b06d8a3b4fc735 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 16:09:05 +0200 Subject: [PATCH 45/52] avoid dtype inference warnings by removing explicit dtype=object --- pandas/tests/dtypes/test_missing.py | 2 +- pandas/tests/frame/methods/test_set_index.py | 4 ++-- pandas/tests/series/indexing/test_getitem.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index e1f8d8eca2537..e3d3e98ae2b93 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -131,7 +131,7 @@ def test_isna_isnull(self, isna_f): [ np.arange(4, dtype=float), [0.0, 1.0, 0.0, 1.0], - Series(list("abcd"), dtype=object), + Series(list("abcd")), date_range("2020-01-01", periods=4), ], ) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 5724f79b82578..1c8d365f0d6c0 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -158,8 +158,8 @@ def test_set_index(self, float_string_frame): def test_set_index_names(self): df = DataFrame( np.ones((10, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(10)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(10)]), ) df.index.name = "name" diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 9783dcd2fea07..9891684e9597c 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -360,7 +360,7 @@ def test_getitem_no_matches(self, box): # GH#33462 we expect the same behavior for list/ndarray/Index/Series ser = Series(["A", "B"]) - key = Series(["C"], dtype=object) + key = Series(["C"]) key = box(key) msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" From b3257e77e963c27b3f0589310165f9405dee415a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 16:40:59 +0200 Subject: [PATCH 46/52] un-xfail tests for replace/fillna downcasting --- pandas/tests/frame/methods/test_combine_first.py | 3 --- pandas/tests/frame/methods/test_fillna.py | 2 -- pandas/tests/frame/methods/test_replace.py | 3 --- pandas/tests/series/methods/test_replace.py | 1 - 4 files changed, 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index a4ee0b08e1e66..8aeab5dacd8b4 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -32,7 +30,6 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index d767e35878b52..e2baa2567f5b4 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -126,7 +126,6 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( @@ -371,7 +370,6 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index fd8039975a514..0884c091ba96a 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -624,7 +624,6 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( @@ -1444,7 +1443,6 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1540,7 +1538,6 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c6727e023e786..850740fac907d 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -760,7 +760,6 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) From cecef0e3002ee44390baf89c2535edc536c6c5c7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 16:51:40 +0200 Subject: [PATCH 47/52] xfail tests triggering empty concat warning --- pandas/tests/reshape/concat/test_empty.py | 4 ++++ pandas/tests/reshape/merge/test_join.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 9560087615123..8f7ea0c42f2c3 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -238,6 +240,8 @@ def test_concat_empty_dataframe_dtypes(self): assert result["b"].dtype == np.float64 assert result["c"].dtype == np.float64 + # triggers warning about empty entries + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_inner_join_empty(self): # GH 15328 df_empty = DataFrame() diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 91f0cf6c31085..9188521c71158 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -341,6 +343,8 @@ def test_join_index_mixed_overlap(self): expected = _join_by_hand(df1, df2) tm.assert_frame_equal(joined, expected) + # triggers warning about empty entries + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() From 4c0d11830e0c63080410b4ba524fc87322b2ce0a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 21:17:36 +0200 Subject: [PATCH 48/52] Update xfails for 2.3.x --- pandas/tests/arrays/categorical/test_analytics.py | 10 +--------- pandas/tests/copy_view/test_array.py | 3 --- pandas/tests/copy_view/test_interp_fillna.py | 4 +--- pandas/tests/frame/methods/test_info.py | 3 +-- pandas/tests/groupby/aggregate/test_cython.py | 1 - pandas/tests/indexes/base_class/test_reshape.py | 5 ++++- pandas/tests/indexing/test_loc.py | 3 +++ pandas/tests/io/parser/common/test_common_basic.py | 3 ++- pandas/tests/series/methods/test_replace.py | 1 + pandas/tests/series/test_logical_ops.py | 4 +++- 10 files changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index a38814ca43773..c2c53fbc4637e 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -4,12 +4,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import ( - HAS_PYARROW, - PYPY, -) +from pandas.compat import PYPY from pandas import ( Categorical, @@ -301,9 +296,6 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_memory_usage(self): cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 5d0efdc149004..9a3f83e0293f5 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -159,7 +157,6 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 1f7f8689d0779..338b76cbf1e7a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW - from pandas import ( NA, ArrowDtype, @@ -161,7 +159,7 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 4594f725b43d5..475632667a87a 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -10,7 +10,6 @@ from pandas._config import using_string_dtype from pandas.compat import ( - HAS_PYARROW, IS64, PYPY, ) @@ -521,7 +520,7 @@ def test_info_int_columns(): assert result == expected -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index fbbace54a3444..fa8a6cb4120b2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -164,7 +164,6 @@ def test_cython_agg_return_dict(): tm.assert_series_equal(ts, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 814a6a516904b..6a544e448ebe1 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -4,6 +4,7 @@ import numpy as np import pytest +import pandas as pd from pandas import Index import pandas._testing as tm @@ -35,7 +36,9 @@ def test_insert(self): null_index = Index([]) tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) - def test_insert_missing(self, nulls_fixture, using_infer_string): + def test_insert_missing(self, request, nulls_fixture, using_infer_string): + if using_infer_string and nulls_fixture is pd.NA: + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH#22295 # test there is no mangling of NA values expected = Index(["a", nulls_fixture, "b", "c"], dtype=object) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4d2d1e336ef07..d61b2ea642439 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1443,6 +1443,9 @@ def test_loc_setitem_listlike_with_timedelta64index(self, indexer, expected): tm.assert_frame_equal(expected, df) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 95a7664304889..2abca1bf52374 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -14,6 +14,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -917,7 +918,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 850740fac907d..c59dbc4ed95d7 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -391,6 +391,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "categorical, numeric", [ diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index ff21427b71cf9..a9f1726afc942 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -368,7 +368,9 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based From fc6bd396abaa51bb6957af063274bcc0c63e7533 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 21:50:50 +0200 Subject: [PATCH 49/52] Fix string dtype comparison in value_counts dtype inference deprecation --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 15a07da76d2f7..56600bd9a5107 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -935,7 +935,7 @@ def value_counts_internal( idx = idx.astype(object) elif ( idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 - and idx.dtype != "string[pyarrow_numpy]" + and idx.dtype != "string" ): warnings.warn( # GH#56161 From bae9be1aa72bde07e234c59f16257e291bcb59b4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 22:15:02 +0200 Subject: [PATCH 50/52] string[pyarrow_numpy] -> str --- pandas/tests/frame/methods/test_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 0884c091ba96a..ccee7ca24bd3d 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -452,7 +452,7 @@ def test_regex_replace_string_types( ) with tm.assert_produces_warning(FutureWarning, match="Downcasting"): result = obj.replace(to_replace, regex=True) - dtype = "string[pyarrow_numpy]" + dtype = "str" else: result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) From 94b797d83fd5d69c4500ad0c6fe97c9e26785cea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Oct 2024 09:33:07 +0200 Subject: [PATCH 51/52] Fix cow ref tracking in replace with list and regex --- pandas/core/internals/blocks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cd1639188b1ad..917a65348b7a3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1208,6 +1208,7 @@ def _replace_coerce( value, inplace=inplace, mask=mask, + using_cow=using_cow, ) else: if value is None: From a10c5c0e20466d0cf7619dc354f7910dc359146f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Oct 2024 15:45:42 +0200 Subject: [PATCH 52/52] suppress pylint errors --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 238abd85dcdb1..ac91f0dcc7269 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -476,7 +476,11 @@ disable = [ "unnecessary-lambda", "unused-argument", "unused-variable", - "using-constant-test" + "using-constant-test", + + # disabled on 2.3.x branch + "consider-using-in", + "simplifiable-if-expression", ] [tool.pytest.ini_options]