From f6ab662682f984ab79bbd6891c72f27d0ca63cae Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 15 Feb 2023 08:49:13 -0800
Subject: [PATCH] CLN: assorted comments

---
 pandas/_libs/internals.pyx                    |  1 +
 pandas/_libs/lib.pyi                          |  1 -
 pandas/_libs/lib.pyx                          | 25 +++-----
 pandas/_libs/parsers.pyx                      | 13 ++---
 pandas/core/frame.py                          |  2 +-
 pandas/core/groupby/generic.py                |  2 +-
 pandas/core/groupby/groupby.py                |  5 +-
 pandas/core/indexes/base.py                   | 38 ++++++------
 pandas/core/indexes/multi.py                  |  5 ++
 pandas/core/indexing.py                       |  3 +
 pandas/core/internals/blocks.py               | 50 ++++++++--------
 pandas/core/internals/concat.py               |  3 -
 pandas/core/internals/construction.py         |  9 +++
 pandas/core/internals/managers.py             |  2 +
 pandas/io/pytables.py                         |  4 ++
 pandas/io/sas/sas.pyx                         |  2 +-
 pandas/tests/extension/test_string.py         | 14 ++---
 .../tests/groupby/transform/test_transform.py | 58 +++++++++----------
 pandas/tests/strings/test_extract.py          | 14 +++--
 19 files changed, 124 insertions(+), 127 deletions(-)

diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index b5ff69d92492f..51c2486ac5c1d 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -7,6 +7,7 @@ from cython cimport Py_ssize_t
 
 cdef extern from "Python.h":
+    # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX
     Py_ssize_t PY_SSIZE_T_MAX
 
 import numpy as np
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 72b46d9e30684..1f6ae49b76adc 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -217,7 +217,6 @@ def count_level_2d(
     mask: np.ndarray,  # ndarray[uint8_t, ndim=2, cast=True],
     labels: np.ndarray,  # const intp_t[:]
     max_bin: int,
-    axis: int,
 ) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=2]
 def get_level_sorter(
     label: np.ndarray,  # const int64_t[:]
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d79f7068effc3..7f718fde79dd1 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -921,29 +921,19 @@ def get_level_sorter(
 def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                    const intp_t[:] labels,
                    Py_ssize_t max_bin,
-                   int axis):
+                   ):
     cdef:
         Py_ssize_t i, j, k, n
         ndarray[int64_t, ndim=2] counts
 
-    assert (axis == 0 or axis == 1)
     n, k = (<object>mask).shape
 
-    if axis == 0:
-        counts = np.zeros((max_bin, k), dtype="i8")
-        with nogil:
-            for i in range(n):
-                for j in range(k):
-                    if mask[i, j]:
-                        counts[labels[i], j] += 1
-
-    else:  # axis == 1
-        counts = np.zeros((n, max_bin), dtype="i8")
-        with nogil:
-            for i in range(n):
-                for j in range(k):
-                    if mask[i, j]:
-                        counts[i, labels[j]] += 1
+    counts = np.zeros((n, max_bin), dtype="i8")
+    with nogil:
+        for i in range(n):
+            for j in range(k):
+                if mask[i, j]:
+                    counts[i, labels[j]] += 1
 
     return counts
 
@@ -1710,6 +1700,7 @@ cdef class Validator:
 
     cdef bint is_valid_null(self, object value) except -1:
        return value is None or value is C_NA or util.is_nan(value)
+        # TODO: include decimal NA?
 
     cdef bint is_array_typed(self) except -1:
         return False
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 20a25afa6a51f..5bddaa61d3196 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -848,6 +848,9 @@ cdef class TextReader:
         with nogil:
             status = tokenize_nrows(self.parser, nrows, self.encoding_errors)
 
+        self._check_tokenize_status(status)
+
+    cdef _check_tokenize_status(self, int status):
         if self.parser.warn_msg != NULL:
             print(PyUnicode_DecodeUTF8(
                 self.parser.warn_msg, strlen(self.parser.warn_msg),
                 self.encoding_errors), file=sys.stderr)
@@ -879,15 +882,7 @@ cdef class TextReader:
         with nogil:
             status = tokenize_all_rows(self.parser, self.encoding_errors)
 
-        if self.parser.warn_msg != NULL:
-            print(PyUnicode_DecodeUTF8(
-                self.parser.warn_msg, strlen(self.parser.warn_msg),
-                self.encoding_errors), file=sys.stderr)
-            free(self.parser.warn_msg)
-            self.parser.warn_msg = NULL
-
-        if status < 0:
-            raise_parser_error("Error tokenizing data", self.parser)
+        self._check_tokenize_status(status)
 
         if self.parser_start >= self.parser.lines:
             raise StopIteration
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 71dc3b523fca6..7e7aa93046430 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7814,7 +7814,7 @@ def combine(
         if self.empty and len(other) == other_idxlen:
             return other.copy()
 
-        # sorts if possible
+        # sorts if possible; otherwise align above ensures that these are set-equal
         new_columns = this.columns.union(other.columns)
         do_fill = fill_value is not None
         result = {}
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index eecf292e4c3c8..799d9cf350513 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1424,7 +1424,7 @@ def _wrap_applied_output_series(
         values: list[Series],
         not_indexed_same: bool,
         first_not_none,
-        key_index,
+        key_index: Index | None,
         is_transform: bool,
     ) -> DataFrame | Series:
         kwargs = first_not_none._construct_axes_dict()
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 8dcf7a0838349..bd7cdbdd40969 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1043,7 +1043,9 @@ def _concat_objects(
             # when the ax has duplicates
             # so we resort to this
             # GH 14776, 30667
+            # TODO: can we re-use e.g. _reindex_non_unique?
             if ax.has_duplicates and not result.axes[self.axis].equals(ax):
+                # e.g. test_category_order_transformer
                 target = algorithms.unique1d(ax._values)
                 indexer, _ = result.index.get_indexer_non_unique(target)
                 result = result.take(indexer, axis=self.axis)
@@ -1460,6 +1462,7 @@ def _agg_py_fallback(
             NotImplementedError.
""" # We get here with a) EADtypes and b) object dtype + assert alt is not None if values.ndim == 1: # For DataFrameGroupBy we only get here with ExtensionArray @@ -1775,7 +1778,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: else: masked = mask & ~isna(bvalues) - counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) + counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups) if is_series: assert counted.ndim == 2 assert counted.shape[0] == 1 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6031bdc62c38a..633d2c1ab30ac 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -78,7 +78,6 @@ LossySetitemError, can_hold_element, common_dtype_categorical_compat, - ensure_dtype_can_hold_na, find_result_type, infer_dtype_from, maybe_cast_pointwise_result, @@ -351,6 +350,7 @@ def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]: # can_use_libjoin assures sv and ov are ndarrays sv = cast(np.ndarray, sv) ov = cast(np.ndarray, ov) + # similar but not identical to ov.searchsorted(sv) return libjoin.left_join_indexer_unique(sv, ov) @final @@ -3132,7 +3132,7 @@ def union(self, other, sort=None): if not is_dtype_equal(self.dtype, other.dtype): if ( isinstance(self, ABCMultiIndex) - and not is_object_dtype(unpack_nested_dtype(other)) + and not is_object_dtype(_unpack_nested_dtype(other)) and len(other) > 0 ): raise NotImplementedError( @@ -3213,6 +3213,8 @@ def _union(self, other: Index, sort): result_dups = algos.union_with_duplicates(self, other) return _maybe_try_sort(result_dups, sort) + # The rest of this method is analogous to Index._intersection_via_get_indexer + # Self may have duplicates; other already checked as unique # find indexes of things in "other" that are not in "self" if self._index_as_unique: @@ -3800,7 +3802,7 @@ def _should_partial_index(self, target: Index) -> bool: return False # See https://github.com/pandas-dev/pandas/issues/47772 the commented # out code can be restored (instead of hardcoding `return True`) - # once that issue if fixed + # once that issue is fixed # "Index" has no attribute "left" # return self.left._should_compare(target) # type: ignore[attr-defined] return True @@ -4778,6 +4780,9 @@ def _join_monotonic( assert other.dtype == self.dtype if self.equals(other): + # This is a convenient place for this check, but its correctness + # does not depend on monotonicity, so it could go earlier + # in the calling method. ret_index = other if how == "right" else self return ret_index, None, None @@ -5762,6 +5767,9 @@ def get_indexer_non_unique( that = target.astype(dtype, copy=False) return this.get_indexer_non_unique(that) + # TODO: get_indexer has fastpaths for both Categorical-self and + # Categorical-target. Can we do something similar here? + # Note: _maybe_promote ensures we never get here with MultiIndex # self and non-Multi target tgt_values = target._get_engine_target() @@ -5922,7 +5930,7 @@ def _get_indexer_non_comparable( If doing an inequality check, i.e. method is not None. """ if method is not None: - other = unpack_nested_dtype(target) + other = _unpack_nested_dtype(target) raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") no_matches = -1 * np.ones(target.shape, dtype=np.intp) @@ -5998,16 +6006,6 @@ def _find_common_type_compat(self, target) -> DtypeObj: Implementation of find_common_type that adjusts for Index-specific special cases. """ - if is_valid_na_for_dtype(target, self.dtype): - # e.g. 
setting NA value into IntervalArray[int64] - dtype = ensure_dtype_can_hold_na(self.dtype) - if is_dtype_equal(self.dtype, dtype): - raise NotImplementedError( - "This should not be reached. Please report a bug at " - "github.com/pandas-dev/pandas" - ) - return dtype - target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) # special case: if one dtype is uint64 and the other a signed int, return object @@ -6040,7 +6038,7 @@ def _should_compare(self, other: Index) -> bool: # respectively. return False - other = unpack_nested_dtype(other) + other = _unpack_nested_dtype(other) dtype = other.dtype return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) @@ -6052,6 +6050,8 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return dtype.kind == "b" elif is_numeric_dtype(self.dtype): return is_numeric_dtype(dtype) + # TODO: this was written assuming we only get here with object-dtype, + # which is nom longer correct. Can we specialize for EA? return True @final @@ -7141,7 +7141,7 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: return names -def unpack_nested_dtype(other: _IndexT) -> _IndexT: +def _unpack_nested_dtype(other: Index) -> Index: """ When checking if our dtype is comparable with another, we need to unpack CategoricalDtype to look at its categories.dtype. @@ -7155,12 +7155,10 @@ def unpack_nested_dtype(other: _IndexT) -> _IndexT: Index """ dtype = other.dtype - if is_categorical_dtype(dtype): + if isinstance(dtype, CategoricalDtype): # If there is ever a SparseIndex, this could get dispatched # here too. - # error: Item "dtype[Any]"/"ExtensionDtype" of "Union[dtype[Any], - # ExtensionDtype]" has no attribute "categories" - return dtype.categories # type: ignore[union-attr] + return dtype.categories return other diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3d8948615f288..0d0fc4a779120 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2746,6 +2746,7 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: Index.get_loc : The get_loc method for (single-level) index. """ if is_scalar(key) and isna(key): + # TODO: need is_valid_na_for_dtype(key, level_index.dtype) return -1 else: return level_index.get_loc(key) @@ -2818,6 +2819,8 @@ def _maybe_to_slice(loc): ) if keylen == self.nlevels and self.is_unique: + # TODO: what if we have an IntervalIndex level? + # i.e. do we need _index_as_unique on that level? try: return self._engine.get_loc(key) except TypeError: @@ -3853,6 +3856,8 @@ def maybe_droplevels(index: Index, key) -> Index: # drop levels original_index = index if isinstance(key, tuple): + # Caller is responsible for ensuring the key is not an entry in the first + # level of the MultiIndex. 
         for _ in key:
             try:
                 index = index._drop_level_numbers([0])
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index d981d8e097dbb..c1435ebbe39ef 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1779,6 +1779,7 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
                         self.obj[key] = empty_value
 
                     else:
+                        # FIXME: GH#42099#issuecomment-864326014
                        self.obj[key] = infer_fill_value(value)
 
             new_indexer = convert_from_missing_indexer_tuple(
@@ -1866,6 +1867,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str):
                 self._setitem_with_indexer_frame_value(indexer, value, name)
 
             elif np.ndim(value) == 2:
+                # TODO: avoid np.ndim call in case it isn't an ndarray, since
+                #  that will construct an ndarray, which will be wasteful
                 self._setitem_with_indexer_2d_value(indexer, value)
 
             elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index db5cb4a70c8f1..0658b2039f085 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -429,6 +429,7 @@ def _maybe_downcast(
             return blocks
 
         if self.dtype == _dtype_obj:
+            # TODO: does it matter that self.dtype might not match blocks[i].dtype?
             # GH#44241 We downcast regardless of the argument;
             #  respecting 'downcast=None' may be worthwhile at some point,
             #  but ATM it breaks too much existing code.
@@ -1817,39 +1818,38 @@ def _unwrap_setitem_indexer(self, indexer):
         """
         # TODO: ATM this doesn't work for iget/_slice, can we change that?
 
-        if isinstance(indexer, tuple):
+        if isinstance(indexer, tuple) and len(indexer) == 2:
             # TODO(EA2D): not needed with 2D EAs
             #  Should never have length > 2. Caller is responsible for checking.
             #  Length 1 is reached via setitem_single_block and setitem_single_column
             #  each of which pass indexer=(pi,)
-            if len(indexer) == 2:
-                if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
-                    # GH#44703 went through indexing.maybe_convert_ix
-                    first, second = indexer
-                    if not (
-                        second.size == 1 and (second == 0).all() and first.shape[1] == 1
-                    ):
-                        raise NotImplementedError(
-                            "This should not be reached. Please report a bug at "
-                            "github.com/pandas-dev/pandas/"
-                        )
-                    indexer = first[:, 0]
-
-                elif lib.is_integer(indexer[1]) and indexer[1] == 0:
-                    # reached via setitem_single_block passing the whole indexer
-                    indexer = indexer[0]
-
-                elif com.is_null_slice(indexer[1]):
-                    indexer = indexer[0]
-
-                elif is_list_like(indexer[1]) and indexer[1][0] == 0:
-                    indexer = indexer[0]
-
-                else:
+            if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
+                # GH#44703 went through indexing.maybe_convert_ix
+                first, second = indexer
+                if not (
+                    second.size == 1 and (second == 0).all() and first.shape[1] == 1
+                ):
                     raise NotImplementedError(
                         "This should not be reached. Please report a bug at "
                         "github.com/pandas-dev/pandas/"
                     )
+                indexer = first[:, 0]
+
+            elif lib.is_integer(indexer[1]) and indexer[1] == 0:
+                # reached via setitem_single_block passing the whole indexer
+                indexer = indexer[0]
+
+            elif com.is_null_slice(indexer[1]):
+                indexer = indexer[0]
+
+            elif is_list_like(indexer[1]) and indexer[1][0] == 0:
+                indexer = indexer[0]
+
+            else:
+                raise NotImplementedError(
+                    "This should not be reached. Please report a bug at "
+                    "github.com/pandas-dev/pandas/"
+                )
         return indexer
 
     @property
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 3c357bd7516c0..a33ce8fd5c459 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -596,9 +596,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
 
     elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
         # TODO(EA2D): special case not needed if all EAs used HybridBlocks
-        # NB: we are still assuming here that Hybrid blocks have shape (1, N)
-        # concatting with at least one EA means we are concatting a single column
-        # the non-EA values are 2D arrays with shape (1, n)
 
         # error: No overload variant of "__getitem__" of "ExtensionArray" matches
         # argument type "Tuple[int, slice]"
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 934d08341395c..4dbdd5e5b77fe 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -362,6 +362,7 @@ def ndarray_to_mgr(
         block_values = [nb]
 
     if len(columns) == 0:
+        # TODO: check len(values) == 0?
         block_values = []
 
     return create_block_manager_from_blocks(
@@ -506,6 +507,8 @@ def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
     # We only get here with `not treat_as_nested(values)`
 
     if len(values) == 0:
+        # TODO: check for length-zero range, in which case return int64 dtype?
+        # TODO: re-use anything in try_cast?
         return np.empty((0, 0), dtype=object)
     elif isinstance(values, range):
         arr = range_to_ndarray(values)
@@ -1007,6 +1010,12 @@ def convert(arr):
                 try_float=coerce_float,
                 convert_to_nullable_dtype=use_nullable_dtypes,
             )
+            # Notes on cases that get here 2023-02-15
+            # 1) we DO get here when arr is all Timestamps and dtype=None
+            # 2) disabling this doesn't break the world, so this must be
+            #    getting caught at a higher level
+            # 3) passing convert_datetime to maybe_convert_objects gets this right
+            # 4) convert_timedelta?
 
             if dtype is None:
                 if arr.dtype == np.dtype("O"):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 9de801b732544..ccfa5ae57b255 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -994,6 +994,8 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
         np.ndarray or ExtensionArray
         """
         if len(self.blocks) == 1:
+            # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
+            #  is this ruled out in the general case?
             result = self.blocks[0].iget((slice(None), loc))
             # in the case of a single block, the new block is a view
             block = new_block(
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 4a95daafd82a9..7461bad99462c 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -4024,6 +4024,10 @@ def get_blk_items(mgr):
         blk_items: list[Index] = get_blk_items(mgr)
         if len(data_columns):
+            # TODO: prove that we only get here with axis == 1?
+            #  It is the case in all extant tests, but NOT the case
+            #  outside this `if len(data_columns)` check.
+
             axis, axis_labels = new_non_index_axes[0]
             new_labels = Index(axis_labels).difference(Index(data_columns))
             mgr = frame.reindex(new_labels, axis=axis)._mgr
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
index 4fe0f5ce91a51..6669686d7aa2c 100644
--- a/pandas/io/sas/sas.pyx
+++ b/pandas/io/sas/sas.pyx
@@ -1,5 +1,5 @@
 # cython: language_level=3, initializedcheck=False
-# cython: warn.undeclared=True, warn.maybe_uninitialized=True, warn.unused=True
+# cython: warn.maybe_uninitialized=True, warn.unused=True
 from cython cimport Py_ssize_t
 from libc.stddef cimport size_t
 from libc.stdint cimport (
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 0743c1e26c62f..ee855bb1cde8c 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -113,8 +113,7 @@ def test_is_not_string_type(self, dtype):
 class TestInterface(base.BaseInterfaceTests):
     def test_view(self, data, request):
         if data.dtype.storage == "pyarrow":
-            mark = pytest.mark.xfail(reason="not implemented")
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_view(data)
 
 
@@ -134,8 +133,7 @@ def test_constructor_from_list(self):
 class TestReshaping(base.BaseReshapingTests):
     def test_transpose(self, data, request):
         if data.dtype.storage == "pyarrow":
-            mark = pytest.mark.xfail(reason="not implemented")
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_transpose(data)
 
 
@@ -146,8 +144,7 @@ class TestGetitem(base.BaseGetitemTests):
 class TestSetitem(base.BaseSetitemTests):
     def test_setitem_preserves_views(self, data, request):
         if data.dtype.storage == "pyarrow":
-            mark = pytest.mark.xfail(reason="not implemented")
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_setitem_preserves_views(data)
 
 
@@ -391,10 +388,7 @@ class Test2DCompat(base.Dim2CompatTests):
     @pytest.fixture(autouse=True)
     def arrow_not_supported(self, data, request):
         if isinstance(data, ArrowStringArray):
-            mark = pytest.mark.xfail(
-                reason="2D support not implemented for ArrowStringArray"
-            )
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
 
 
 def test_searchsorted_with_na_raises(data_for_sorting, as_series):
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 7ead3890c5130..8abcc52db0500 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -892,7 +892,6 @@ def test_pad_stable_sorting(fill_method):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("test_series", [True, False])
 @pytest.mark.parametrize(
     "freq",
     [
@@ -908,8 +907,8 @@ def test_pad_stable_sorting(fill_method):
 @pytest.mark.parametrize("periods", [1, -1])
 @pytest.mark.parametrize("fill_method", ["ffill", "bfill", None])
 @pytest.mark.parametrize("limit", [None, 1])
-def test_pct_change(test_series, freq, periods, fill_method, limit):
-    # GH 21200, 21621, 30463
+def test_pct_change(frame_or_series, freq, periods, fill_method, limit):
+    # GH 21200, 21621, 30463
     vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
     keys = ["a", "b"]
     key_v = np.repeat(keys, len(vals))
@@ -922,16 +921,17 @@ def test_pct_change(test_series, freq, periods, fill_method, limit):
 
     expected = grp["vals"].obj / grp["vals"].shift(periods) - 1
 
-    if test_series:
-        result = df.groupby("key")["vals"].pct_change(
-            periods=periods, fill_method=fill_method, limit=limit, freq=freq
-        )
-        tm.assert_series_equal(result, expected)
+    gb = df.groupby("key")
+
+    if frame_or_series is Series:
+        gb = gb["vals"]
     else:
-        result = df.groupby("key").pct_change(
-            periods=periods, fill_method=fill_method, limit=limit, freq=freq
-        )
-        tm.assert_frame_equal(result, expected.to_frame("vals"))
+        expected = expected.to_frame("vals")
+
+    result = gb.pct_change(
+        periods=periods, fill_method=fill_method, limit=limit, freq=freq
+    )
+    tm.assert_equal(result, expected)
 
 
 @pytest.mark.parametrize(
@@ -1096,19 +1096,16 @@ def test_transform_invalid_name_raises():
         g.transform("some_arbitrary_name")
 
 
-@pytest.mark.parametrize(
-    "obj",
-    [
-        DataFrame(
-            {"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
-            index=["A", "B", "C", "D", "E", "F"],
-        ),
-        Series([0, 0, 0, 1, 1, 1], index=["A", "B", "C", "D", "E", "F"]),
-    ],
-)
-def test_transform_agg_by_name(request, reduction_func, obj):
+def test_transform_agg_by_name(request, reduction_func, frame_or_series):
     func = reduction_func
 
+    obj = DataFrame(
+        {"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
+        index=["A", "B", "C", "D", "E", "F"],
+    )
+    if frame_or_series is Series:
+        obj = obj["a"]
+
     g = obj.groupby(np.repeat([0, 1], 3))
 
     if func == "corrwith" and isinstance(obj, Series):  # GH#32293
@@ -1444,18 +1441,15 @@ def test_null_group_str_transformer_series(dropna, transformation_func):
 
 
 @pytest.mark.parametrize(
-    "func, series, expected_values",
+    "func, expected_values",
     [
-        (Series.sort_values, False, [5, 4, 3, 2, 1]),
-        (lambda x: x.head(1), False, [5.0, np.nan, 3, 2, np.nan]),
-        # SeriesGroupBy already has correct behavior
-        (Series.sort_values, True, [5, 4, 3, 2, 1]),
-        (lambda x: x.head(1), True, [5.0, np.nan, 3.0, 2.0, np.nan]),
+        (Series.sort_values, [5, 4, 3, 2, 1]),
+        (lambda x: x.head(1), [5.0, np.nan, 3, 2, np.nan]),
     ],
 )
 @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
 @pytest.mark.parametrize("keys_in_index", [True, False])
-def test_transform_aligns(func, series, expected_values, keys, keys_in_index):
+def test_transform_aligns(func, frame_or_series, expected_values, keys, keys_in_index):
     # GH#45648 - transform should align with the input's index
     df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]})
     if "a2" in keys:
@@ -1464,12 +1458,12 @@ def test_transform_aligns(func, series, expected_values, keys, keys_in_index):
         df = df.set_index(keys, append=True)
 
     gb = df.groupby(keys)
-    if series:
+    if frame_or_series is Series:
         gb = gb["b"]
 
     result = gb.transform(func)
     expected = DataFrame({"b": expected_values}, index=df.index)
-    if series:
+    if frame_or_series is Series:
         expected = expected["b"]
     tm.assert_equal(result, expected)
diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py
index e9193113d0220..22a5fd28efd8d 100644
--- a/pandas/tests/strings/test_extract.py
+++ b/pandas/tests/strings/test_extract.py
@@ -174,22 +174,24 @@ def test_extract_expand_capture_groups(any_string_dtype):
     tm.assert_frame_equal(result, expected)
 
 
-def test_extract_expand_capture_groups_index(request, index, any_string_dtype):
+def test_extract_expand_capture_groups_index(index, any_string_dtype):
     # https://github.com/pandas-dev/pandas/issues/6348
     # not passing index to the extractor
     data = ["A1", "B2", "C"]
 
-    if len(index) < len(data):
-        request.node.add_marker(pytest.mark.xfail(reason="Index too short."))
+    if len(index) == 0:
+        pytest.skip("Test requires len(index) > 0")
+    while len(index) < len(data):
+        index = index.repeat(2)
 
     index = index[: len(data)]
-    s = Series(data, index=index, dtype=any_string_dtype)
+    ser = Series(data, index=index, dtype=any_string_dtype)
 
-    result = s.str.extract(r"(\d)", expand=False)
+    result = ser.str.extract(r"(\d)", expand=False)
     expected = Series(["1", "2", np.nan], index=index, dtype=any_string_dtype)
     tm.assert_series_equal(result, expected)
 
-    result = s.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
+    result = ser.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
     expected = DataFrame(
         [["A", "1"], ["B", "2"], ["C", np.nan]],
         columns=["letter", "number"],