From f6ab662682f984ab79bbd6891c72f27d0ca63cae Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 15 Feb 2023 08:49:13 -0800
Subject: [PATCH] CLN: assorted comments

---
 pandas/_libs/internals.pyx                    |  1 +
 pandas/_libs/lib.pyi                          |  1 -
 pandas/_libs/lib.pyx                          | 25 +++-----
 pandas/_libs/parsers.pyx                      | 13 ++---
 pandas/core/frame.py                          |  2 +-
 pandas/core/groupby/generic.py                |  2 +-
 pandas/core/groupby/groupby.py                |  5 +-
 pandas/core/indexes/base.py                   | 38 ++++++------
 pandas/core/indexes/multi.py                  |  5 ++
 pandas/core/indexing.py                       |  3 +
 pandas/core/internals/blocks.py               | 50 ++++++++--------
 pandas/core/internals/concat.py               |  3 -
 pandas/core/internals/construction.py         |  9 +++
 pandas/core/internals/managers.py             |  2 +
 pandas/io/pytables.py                         |  4 ++
 pandas/io/sas/sas.pyx                         |  2 +-
 pandas/tests/extension/test_string.py         | 14 ++---
 .../tests/groupby/transform/test_transform.py | 58 +++++++++----------
 pandas/tests/strings/test_extract.py          | 14 +++--
 19 files changed, 124 insertions(+), 127 deletions(-)

diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index b5ff69d92492f..51c2486ac5c1d 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -7,6 +7,7 @@ from cython cimport Py_ssize_t
 
 cdef extern from "Python.h":
+    # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX
     Py_ssize_t PY_SSIZE_T_MAX
 
 import numpy as np
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 72b46d9e30684..1f6ae49b76adc 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -217,7 +217,6 @@ def count_level_2d(
     mask: np.ndarray,  # ndarray[uint8_t, ndim=2, cast=True],
     labels: np.ndarray,  # const intp_t[:]
     max_bin: int,
-    axis: int,
 ) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=2]
 def get_level_sorter(
     label: np.ndarray,  # const int64_t[:]
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d79f7068effc3..7f718fde79dd1 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -921,29 +921,19 @@ def get_level_sorter(
 def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                    const intp_t[:] labels,
                    Py_ssize_t max_bin,
-                   int axis):
+                   ):
     cdef:
         Py_ssize_t i, j, k, n
         ndarray[int64_t, ndim=2] counts
 
-    assert (axis == 0 or axis == 1)
     n, k = (<object>mask).shape
 
-    if axis == 0:
-        counts = np.zeros((max_bin, k), dtype="i8")
-        with nogil:
-            for i in range(n):
-                for j in range(k):
-                    if mask[i, j]:
-                        counts[labels[i], j] += 1
-
-    else:  # axis == 1
-        counts = np.zeros((n, max_bin), dtype="i8")
-        with nogil:
-            for i in range(n):
-                for j in range(k):
-                    if mask[i, j]:
-                        counts[i, labels[j]] += 1
+    counts = np.zeros((n, max_bin), dtype="i8")
+    with nogil:
+        for i in range(n):
+            for j in range(k):
+                if mask[i, j]:
+                    counts[i, labels[j]] += 1
 
     return counts
 
@@ -1710,6 +1700,7 @@ cdef class Validator:
 
     cdef bint is_valid_null(self, object value) except -1:
        return value is None or value is C_NA or util.is_nan(value)
+        # TODO: include decimal NA?
 
     cdef bint is_array_typed(self) except -1:
         return False
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 20a25afa6a51f..5bddaa61d3196 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -848,6 +848,9 @@ cdef class TextReader:
         with nogil:
             status = tokenize_nrows(self.parser, nrows, self.encoding_errors)
 
+        self._check_tokenize_status(status)
+
+    cdef _check_tokenize_status(self, int status):
         if self.parser.warn_msg != NULL:
             print(PyUnicode_DecodeUTF8(
                 self.parser.warn_msg, strlen(self.parser.warn_msg),
                 self.encoding_errors), file=sys.stderr)
@@ -879,15 +882,7 @@ cdef class TextReader:
         with nogil:
             status = tokenize_all_rows(self.parser, self.encoding_errors)
 
-        if self.parser.warn_msg != NULL:
-            print(PyUnicode_DecodeUTF8(
-                self.parser.warn_msg, strlen(self.parser.warn_msg),
-                self.encoding_errors), file=sys.stderr)
-            free(self.parser.warn_msg)
-            self.parser.warn_msg = NULL
-
-        if status < 0:
-            raise_parser_error("Error tokenizing data", self.parser)
+        self._check_tokenize_status(status)
 
         if self.parser_start >= self.parser.lines:
             raise StopIteration
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 71dc3b523fca6..7e7aa93046430 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7814,7 +7814,7 @@ def combine(
         if self.empty and len(other) == other_idxlen:
             return other.copy()
 
-        # sorts if possible
+        # sorts if possible; otherwise align above ensures that these are set-equal
         new_columns = this.columns.union(other.columns)
         do_fill = fill_value is not None
         result = {}
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index eecf292e4c3c8..799d9cf350513 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1424,7 +1424,7 @@ def _wrap_applied_output_series(
         values: list[Series],
         not_indexed_same: bool,
         first_not_none,
-        key_index,
+        key_index: Index | None,
         is_transform: bool,
     ) -> DataFrame | Series:
         kwargs = first_not_none._construct_axes_dict()
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 8dcf7a0838349..bd7cdbdd40969 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1043,7 +1043,9 @@ def _concat_objects(
             # when the ax has duplicates
             # so we resort to this
             # GH 14776, 30667
+            # TODO: can we re-use e.g. _reindex_non_unique?
             if ax.has_duplicates and not result.axes[self.axis].equals(ax):
+                # e.g. test_category_order_transformer
                 target = algorithms.unique1d(ax._values)
                 indexer, _ = result.index.get_indexer_non_unique(target)
                 result = result.take(indexer, axis=self.axis)
@@ -1460,6 +1462,7 @@ def _agg_py_fallback(
             NotImplementedError.
""" # We get here with a) EADtypes and b) object dtype + assert alt is not None if values.ndim == 1: # For DataFrameGroupBy we only get here with ExtensionArray @@ -1775,7 +1778,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: else: masked = mask & ~isna(bvalues) - counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) + counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups) if is_series: assert counted.ndim == 2 assert counted.shape[0] == 1 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6031bdc62c38a..633d2c1ab30ac 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -78,7 +78,6 @@ LossySetitemError, can_hold_element, common_dtype_categorical_compat, - ensure_dtype_can_hold_na, find_result_type, infer_dtype_from, maybe_cast_pointwise_result, @@ -351,6 +350,7 @@ def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]: # can_use_libjoin assures sv and ov are ndarrays sv = cast(np.ndarray, sv) ov = cast(np.ndarray, ov) + # similar but not identical to ov.searchsorted(sv) return libjoin.left_join_indexer_unique(sv, ov) @final @@ -3132,7 +3132,7 @@ def union(self, other, sort=None): if not is_dtype_equal(self.dtype, other.dtype): if ( isinstance(self, ABCMultiIndex) - and not is_object_dtype(unpack_nested_dtype(other)) + and not is_object_dtype(_unpack_nested_dtype(other)) and len(other) > 0 ): raise NotImplementedError( @@ -3213,6 +3213,8 @@ def _union(self, other: Index, sort): result_dups = algos.union_with_duplicates(self, other) return _maybe_try_sort(result_dups, sort) + # The rest of this method is analogous to Index._intersection_via_get_indexer + # Self may have duplicates; other already checked as unique # find indexes of things in "other" that are not in "self" if self._index_as_unique: @@ -3800,7 +3802,7 @@ def _should_partial_index(self, target: Index) -> bool: return False # See https://github.com/pandas-dev/pandas/issues/47772 the commented # out code can be restored (instead of hardcoding `return True`) - # once that issue if fixed + # once that issue is fixed # "Index" has no attribute "left" # return self.left._should_compare(target) # type: ignore[attr-defined] return True @@ -4778,6 +4780,9 @@ def _join_monotonic( assert other.dtype == self.dtype if self.equals(other): + # This is a convenient place for this check, but its correctness + # does not depend on monotonicity, so it could go earlier + # in the calling method. ret_index = other if how == "right" else self return ret_index, None, None @@ -5762,6 +5767,9 @@ def get_indexer_non_unique( that = target.astype(dtype, copy=False) return this.get_indexer_non_unique(that) + # TODO: get_indexer has fastpaths for both Categorical-self and + # Categorical-target. Can we do something similar here? + # Note: _maybe_promote ensures we never get here with MultiIndex # self and non-Multi target tgt_values = target._get_engine_target() @@ -5922,7 +5930,7 @@ def _get_indexer_non_comparable( If doing an inequality check, i.e. method is not None. """ if method is not None: - other = unpack_nested_dtype(target) + other = _unpack_nested_dtype(target) raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") no_matches = -1 * np.ones(target.shape, dtype=np.intp) @@ -5998,16 +6006,6 @@ def _find_common_type_compat(self, target) -> DtypeObj: Implementation of find_common_type that adjusts for Index-specific special cases. """ - if is_valid_na_for_dtype(target, self.dtype): - # e.g. 
setting NA value into IntervalArray[int64] - dtype = ensure_dtype_can_hold_na(self.dtype) - if is_dtype_equal(self.dtype, dtype): - raise NotImplementedError( - "This should not be reached. Please report a bug at " - "github.com/pandas-dev/pandas" - ) - return dtype - target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) # special case: if one dtype is uint64 and the other a signed int, return object @@ -6040,7 +6038,7 @@ def _should_compare(self, other: Index) -> bool: # respectively. return False - other = unpack_nested_dtype(other) + other = _unpack_nested_dtype(other) dtype = other.dtype return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) @@ -6052,6 +6050,8 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return dtype.kind == "b" elif is_numeric_dtype(self.dtype): return is_numeric_dtype(dtype) + # TODO: this was written assuming we only get here with object-dtype, + # which is nom longer correct. Can we specialize for EA? return True @final @@ -7141,7 +7141,7 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: return names -def unpack_nested_dtype(other: _IndexT) -> _IndexT: +def _unpack_nested_dtype(other: Index) -> Index: """ When checking if our dtype is comparable with another, we need to unpack CategoricalDtype to look at its categories.dtype. @@ -7155,12 +7155,10 @@ def unpack_nested_dtype(other: _IndexT) -> _IndexT: Index """ dtype = other.dtype - if is_categorical_dtype(dtype): + if isinstance(dtype, CategoricalDtype): # If there is ever a SparseIndex, this could get dispatched # here too. - # error: Item "dtype[Any]"/"ExtensionDtype" of "Union[dtype[Any], - # ExtensionDtype]" has no attribute "categories" - return dtype.categories # type: ignore[union-attr] + return dtype.categories return other diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3d8948615f288..0d0fc4a779120 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2746,6 +2746,7 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: Index.get_loc : The get_loc method for (single-level) index. """ if is_scalar(key) and isna(key): + # TODO: need is_valid_na_for_dtype(key, level_index.dtype) return -1 else: return level_index.get_loc(key) @@ -2818,6 +2819,8 @@ def _maybe_to_slice(loc): ) if keylen == self.nlevels and self.is_unique: + # TODO: what if we have an IntervalIndex level? + # i.e. do we need _index_as_unique on that level? try: return self._engine.get_loc(key) except TypeError: @@ -3853,6 +3856,8 @@ def maybe_droplevels(index: Index, key) -> Index: # drop levels original_index = index if isinstance(key, tuple): + # Caller is responsible for ensuring the key is not an entry in the first + # level of the MultiIndex. 
         for _ in key:
             try:
                 index = index._drop_level_numbers([0])
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index d981d8e097dbb..c1435ebbe39ef 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1779,6 +1779,7 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
                         self.obj[key] = empty_value
 
                     else:
+                        # FIXME: GH#42099#issuecomment-864326014
                        self.obj[key] = infer_fill_value(value)
 
             new_indexer = convert_from_missing_indexer_tuple(
@@ -1866,6 +1867,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str):
                 self._setitem_with_indexer_frame_value(indexer, value, name)
 
             elif np.ndim(value) == 2:
+                # TODO: avoid np.ndim call in case it isn't an ndarray, since
+                #  that will construct an ndarray, which will be wasteful
                 self._setitem_with_indexer_2d_value(indexer, value)
 
             elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index db5cb4a70c8f1..0658b2039f085 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -429,6 +429,7 @@ def _maybe_downcast(
             return blocks
 
         if self.dtype == _dtype_obj:
+            # TODO: does it matter that self.dtype might not match blocks[i].dtype?
             # GH#44241 We downcast regardless of the argument;
             #  respecting 'downcast=None' may be worthwhile at some point,
             #  but ATM it breaks too much existing code.
@@ -1817,39 +1818,38 @@ def _unwrap_setitem_indexer(self, indexer):
         """
         # TODO: ATM this doesn't work for iget/_slice, can we change that?
 
-        if isinstance(indexer, tuple):
+        if isinstance(indexer, tuple) and len(indexer) == 2:
             # TODO(EA2D): not needed with 2D EAs
             #  Should never have length > 2. Caller is responsible for checking.
             #  Length 1 is reached via setitem_single_block and setitem_single_column
             #  each of which pass indexer=(pi,)
-            if len(indexer) == 2:
-                if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
-                    # GH#44703 went through indexing.maybe_convert_ix
-                    first, second = indexer
-                    if not (
-                        second.size == 1 and (second == 0).all() and first.shape[1] == 1
-                    ):
-                        raise NotImplementedError(
-                            "This should not be reached. Please report a bug at "
-                            "github.com/pandas-dev/pandas/"
-                        )
-                    indexer = first[:, 0]
-
-                elif lib.is_integer(indexer[1]) and indexer[1] == 0:
-                    # reached via setitem_single_block passing the whole indexer
-                    indexer = indexer[0]
-
-                elif com.is_null_slice(indexer[1]):
-                    indexer = indexer[0]
-
-                elif is_list_like(indexer[1]) and indexer[1][0] == 0:
-                    indexer = indexer[0]
-
-                else:
+            if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
+                # GH#44703 went through indexing.maybe_convert_ix
+                first, second = indexer
+                if not (
+                    second.size == 1 and (second == 0).all() and first.shape[1] == 1
+                ):
                     raise NotImplementedError(
                         "This should not be reached. Please report a bug at "
                         "github.com/pandas-dev/pandas/"
                     )
+                indexer = first[:, 0]
+
+            elif lib.is_integer(indexer[1]) and indexer[1] == 0:
+                # reached via setitem_single_block passing the whole indexer
+                indexer = indexer[0]
+
+            elif com.is_null_slice(indexer[1]):
+                indexer = indexer[0]
+
+            elif is_list_like(indexer[1]) and indexer[1][0] == 0:
+                indexer = indexer[0]
+
+            else:
+                raise NotImplementedError(
+                    "This should not be reached. Please report a bug at "
+                    "github.com/pandas-dev/pandas/"
+                )
         return indexer
 
     @property
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 3c357bd7516c0..a33ce8fd5c459 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -596,9 +596,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
 
     elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
         # TODO(EA2D): special case not needed if all EAs used HybridBlocks
-        # NB: we are still assuming here that Hybrid blocks have shape (1, N)
-        # concatting with at least one EA means we are concatting a single column
-        # the non-EA values are 2D arrays with shape (1, n)
 
         # error: No overload variant of "__getitem__" of "ExtensionArray" matches
         # argument type "Tuple[int, slice]"
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 934d08341395c..4dbdd5e5b77fe 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -362,6 +362,7 @@ def ndarray_to_mgr(
         block_values = [nb]
 
     if len(columns) == 0:
+        # TODO: check len(values) == 0?
         block_values = []
 
     return create_block_manager_from_blocks(
@@ -506,6 +507,8 @@ def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
     # We only get here with `not treat_as_nested(values)`
 
     if len(values) == 0:
+        # TODO: check for length-zero range, in which case return int64 dtype?
+        # TODO: re-use anything in try_cast?
         return np.empty((0, 0), dtype=object)
     elif isinstance(values, range):
         arr = range_to_ndarray(values)
@@ -1007,6 +1010,12 @@ def convert(arr):
                 try_float=coerce_float,
                 convert_to_nullable_dtype=use_nullable_dtypes,
             )
+            # Notes on cases that get here 2023-02-15
+            # 1) we DO get here when arr is all Timestamps and dtype=None
+            # 2) disabling this doesn't break the world, so this must be
+            #    getting caught at a higher level
+            # 3) passing convert_datetime to maybe_convert_objects gets this right
+            # 4) convert_timedelta?
 
             if dtype is None:
                 if arr.dtype == np.dtype("O"):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 9de801b732544..ccfa5ae57b255 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -994,6 +994,8 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
         np.ndarray or ExtensionArray
         """
         if len(self.blocks) == 1:
+            # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
+            #  is this ruled out in the general case?
             result = self.blocks[0].iget((slice(None), loc))
             # in the case of a single block, the new block is a view
             block = new_block(
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 4a95daafd82a9..7461bad99462c 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -4024,6 +4024,10 @@ def get_blk_items(mgr):
         blk_items: list[Index] = get_blk_items(mgr)
         if len(data_columns):
+            # TODO: prove that we only get here with axis == 1?
+            #  It is the case in all extant tests, but NOT the case
+            #  outside this `if len(data_columns)` check.
+
             axis, axis_labels = new_non_index_axes[0]
             new_labels = Index(axis_labels).difference(Index(data_columns))
             mgr = frame.reindex(new_labels, axis=axis)._mgr
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
index 4fe0f5ce91a51..6669686d7aa2c 100644
--- a/pandas/io/sas/sas.pyx
+++ b/pandas/io/sas/sas.pyx
@@ -1,5 +1,5 @@
 # cython: language_level=3, initializedcheck=False
-# cython: warn.undeclared=True, warn.maybe_uninitialized=True, warn.unused=True
+# cython: warn.maybe_uninitialized=True, warn.unused=True
 from cython cimport Py_ssize_t
 from libc.stddef cimport size_t
 from libc.stdint cimport (
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 0743c1e26c62f..ee855bb1cde8c 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -113,8 +113,7 @@ def test_is_not_string_type(self, dtype):
 class TestInterface(base.BaseInterfaceTests):
     def test_view(self, data, request):
         if data.dtype.storage == "pyarrow":
-            mark = pytest.mark.xfail(reason="not implemented")
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_view(data)
 
 
@@ -134,8 +133,7 @@ def test_constructor_from_list(self):
 class TestReshaping(base.BaseReshapingTests):
     def test_transpose(self, data, request):
         if data.dtype.storage == "pyarrow":
-            mark = pytest.mark.xfail(reason="not implemented")
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_transpose(data)
 
 
@@ -146,8 +144,7 @@ class TestGetitem(base.BaseGetitemTests):
 class TestSetitem(base.BaseSetitemTests):
     def test_setitem_preserves_views(self, data, request):
         if data.dtype.storage == "pyarrow":
-            mark = pytest.mark.xfail(reason="not implemented")
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_setitem_preserves_views(data)
 
 
@@ -391,10 +388,7 @@ class Test2DCompat(base.Dim2CompatTests):
     @pytest.fixture(autouse=True)
     def arrow_not_supported(self, data, request):
         if isinstance(data, ArrowStringArray):
-            mark = pytest.mark.xfail(
-                reason="2D support not implemented for ArrowStringArray"
-            )
-            request.node.add_marker(mark)
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
 
 
 def test_searchsorted_with_na_raises(data_for_sorting, as_series):
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 7ead3890c5130..8abcc52db0500 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -892,7 +892,6 @@ def test_pad_stable_sorting(fill_method):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("test_series", [True, False])
 @pytest.mark.parametrize(
     "freq",
     [
@@ -908,8 +907,8 @@ def test_pad_stable_sorting(fill_method):
 @pytest.mark.parametrize("periods", [1, -1])
 @pytest.mark.parametrize("fill_method", ["ffill", "bfill", None])
 @pytest.mark.parametrize("limit", [None, 1])
-def test_pct_change(test_series, freq, periods, fill_method, limit):
-    # GH 21200, 21621, 30463
+def test_pct_change(frame_or_series, freq, periods, fill_method, limit):
+    # GH 21200, 21621, 30463
     vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
     keys = ["a", "b"]
     key_v = np.repeat(keys, len(vals))
@@ -922,16 +921,17 @@ def test_pct_change(test_series, freq, periods, fill_method, limit):
 
     expected = grp["vals"].obj / grp["vals"].shift(periods) - 1
 
-    if test_series:
-        result = df.groupby("key")["vals"].pct_change(
-            periods=periods, fill_method=fill_method, limit=limit, freq=freq
-        )
-        tm.assert_series_equal(result, expected)
+    gb = df.groupby("key")
+
+    if frame_or_series is Series:
+        gb = gb["vals"]
     else:
-        result = df.groupby("key").pct_change(
-            periods=periods, fill_method=fill_method, limit=limit, freq=freq
-        )
-        tm.assert_frame_equal(result, expected.to_frame("vals"))
+        expected = expected.to_frame("vals")
+
+    result = gb.pct_change(
+        periods=periods, fill_method=fill_method, limit=limit, freq=freq
+    )
+    tm.assert_equal(result, expected)
 
 
 @pytest.mark.parametrize(
@@ -1096,19 +1096,16 @@ def test_transform_invalid_name_raises():
         g.transform("some_arbitrary_name")
 
 
-@pytest.mark.parametrize(
-    "obj",
-    [
-        DataFrame(
-            {"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
-            index=["A", "B", "C", "D", "E", "F"],
-        ),
-        Series([0, 0, 0, 1, 1, 1], index=["A", "B", "C", "D", "E", "F"]),
-    ],
-)
-def test_transform_agg_by_name(request, reduction_func, obj):
+def test_transform_agg_by_name(request, reduction_func, frame_or_series):
     func = reduction_func
 
+    obj = DataFrame(
+        {"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
+        index=["A", "B", "C", "D", "E", "F"],
+    )
+    if frame_or_series is Series:
+        obj = obj["a"]
+
     g = obj.groupby(np.repeat([0, 1], 3))
 
     if func == "corrwith" and isinstance(obj, Series):  # GH#32293
@@ -1444,18 +1441,15 @@ def test_null_group_str_transformer_series(dropna, transformation_func):
 
 
 @pytest.mark.parametrize(
-    "func, series, expected_values",
+    "func, expected_values",
     [
-        (Series.sort_values, False, [5, 4, 3, 2, 1]),
-        (lambda x: x.head(1), False, [5.0, np.nan, 3, 2, np.nan]),
-        # SeriesGroupBy already has correct behavior
-        (Series.sort_values, True, [5, 4, 3, 2, 1]),
-        (lambda x: x.head(1), True, [5.0, np.nan, 3.0, 2.0, np.nan]),
+        (Series.sort_values, [5, 4, 3, 2, 1]),
+        (lambda x: x.head(1), [5.0, np.nan, 3, 2, np.nan]),
     ],
 )
 @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
 @pytest.mark.parametrize("keys_in_index", [True, False])
-def test_transform_aligns(func, series, expected_values, keys, keys_in_index):
+def test_transform_aligns(func, frame_or_series, expected_values, keys, keys_in_index):
     # GH#45648 - transform should align with the input's index
     df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]})
     if "a2" in keys:
@@ -1464,12 +1458,12 @@ def test_transform_aligns(func, series, expected_values, keys, keys_in_index):
         df = df.set_index(keys, append=True)
 
     gb = df.groupby(keys)
-    if series:
+    if frame_or_series is Series:
         gb = gb["b"]
 
     result = gb.transform(func)
     expected = DataFrame({"b": expected_values}, index=df.index)
-    if series:
+    if frame_or_series is Series:
         expected = expected["b"]
     tm.assert_equal(result, expected)
diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py
index e9193113d0220..22a5fd28efd8d 100644
--- a/pandas/tests/strings/test_extract.py
+++ b/pandas/tests/strings/test_extract.py
@@ -174,22 +174,24 @@ def test_extract_expand_capture_groups(any_string_dtype):
     tm.assert_frame_equal(result, expected)
 
 
-def test_extract_expand_capture_groups_index(request, index, any_string_dtype):
+def test_extract_expand_capture_groups_index(index, any_string_dtype):
     # https://github.com/pandas-dev/pandas/issues/6348
     # not passing index to the extractor
     data = ["A1", "B2", "C"]
 
-    if len(index) < len(data):
-        request.node.add_marker(pytest.mark.xfail(reason="Index too short."))
+    if len(index) == 0:
+        pytest.skip("Test requires len(index) > 0")
+    while len(index) < len(data):
+        index = index.repeat(2)
 
     index = index[: len(data)]
-    s = Series(data, index=index, dtype=any_string_dtype)
+    ser = Series(data, index=index, dtype=any_string_dtype)
 
-    result = s.str.extract(r"(\d)", expand=False)
+    result = ser.str.extract(r"(\d)", expand=False)
     expected = Series(["1", "2", np.nan], index=index, dtype=any_string_dtype)
     tm.assert_series_equal(result, expected)
 
-    result = s.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
+    result = ser.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
     expected = DataFrame(
         [["A", "1"], ["B", "2"], ["C", np.nan]],
         columns=["letter", "number"],