Commit 5700ada

CLN: assorted comments (#51410)

1 parent 6d5e78e

File tree: 19 files changed (+124, -127 lines)


pandas/_libs/internals.pyx (+1)

```diff
@@ -7,6 +7,7 @@ from cython cimport Py_ssize_t
 
 
 cdef extern from "Python.h":
+    # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX
     Py_ssize_t PY_SSIZE_T_MAX
 
 import numpy as np
```

pandas/_libs/lib.pyi (-1)

```diff
@@ -217,7 +217,6 @@ def count_level_2d(
     mask: np.ndarray,  # ndarray[uint8_t, ndim=2, cast=True],
     labels: np.ndarray,  # const intp_t[:]
     max_bin: int,
-    axis: int,
 ) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=2]
 def get_level_sorter(
     label: np.ndarray,  # const int64_t[:]
```

pandas/_libs/lib.pyx (+8, -17)

```diff
@@ -924,29 +924,19 @@ def get_level_sorter(
 def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                    const intp_t[:] labels,
                    Py_ssize_t max_bin,
-                   int axis):
+                   ):
     cdef:
         Py_ssize_t i, j, k, n
         ndarray[int64_t, ndim=2] counts
 
-    assert (axis == 0 or axis == 1)
     n, k = (<object>mask).shape
 
-    if axis == 0:
-        counts = np.zeros((max_bin, k), dtype="i8")
-        with nogil:
-            for i in range(n):
-                for j in range(k):
-                    if mask[i, j]:
-                        counts[labels[i], j] += 1
-
-    else:  # axis == 1
-        counts = np.zeros((n, max_bin), dtype="i8")
-        with nogil:
-            for i in range(n):
-                for j in range(k):
-                    if mask[i, j]:
-                        counts[i, labels[j]] += 1
+    counts = np.zeros((n, max_bin), dtype="i8")
+    with nogil:
+        for i in range(n):
+            for j in range(k):
+                if mask[i, j]:
+                    counts[i, labels[j]] += 1
 
     return counts
@@ -1713,6 +1703,7 @@ cdef class Validator:
 
     cdef bint is_valid_null(self, object value) except -1:
         return value is None or value is C_NA or util.is_nan(value)
+        # TODO: include decimal NA?
 
     cdef bint is_array_typed(self) except -1:
         return False
```
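For reference, the surviving branch is the old `axis == 1` path; the only caller passed `axis=1`, so the `axis == 0` branch was dead code. A minimal pure-Python sketch of what `count_level_2d` now computes (the function name below is illustrative, not part of pandas):

```python
import numpy as np

def count_level_2d_sketch(mask: np.ndarray, labels: np.ndarray, max_bin: int) -> np.ndarray:
    # Pure-NumPy rendering of the surviving (formerly axis=1) branch:
    # for each row i, count the True cells whose column's label is b.
    n, k = mask.shape
    counts = np.zeros((n, max_bin), dtype=np.int64)
    for i in range(n):
        for j in range(k):
            if mask[i, j]:
                counts[i, labels[j]] += 1
    return counts

mask = np.array([[True, False, True],
                 [True, True, False]])
labels = np.array([0, 0, 1])  # columns 0 and 1 map to bin 0, column 2 to bin 1
print(count_level_2d_sketch(mask, labels, max_bin=2))
# [[1 1]
#  [2 0]]
```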

pandas/_libs/parsers.pyx (+4, -9)

```diff
@@ -848,6 +848,9 @@ cdef class TextReader:
         with nogil:
             status = tokenize_nrows(self.parser, nrows, self.encoding_errors)
 
+        self._check_tokenize_status(status)
+
+    cdef _check_tokenize_status(self, int status):
         if self.parser.warn_msg != NULL:
             print(PyUnicode_DecodeUTF8(
                 self.parser.warn_msg, strlen(self.parser.warn_msg),
@@ -879,15 +882,7 @@ cdef class TextReader:
         with nogil:
             status = tokenize_all_rows(self.parser, self.encoding_errors)
 
-        if self.parser.warn_msg != NULL:
-            print(PyUnicode_DecodeUTF8(
-                self.parser.warn_msg, strlen(self.parser.warn_msg),
-                self.encoding_errors), file=sys.stderr)
-            free(self.parser.warn_msg)
-            self.parser.warn_msg = NULL
-
-        if status < 0:
-            raise_parser_error("Error tokenizing data", self.parser)
+        self._check_tokenize_status(status)
 
         if self.parser_start >= self.parser.lines:
             raise StopIteration
```
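This is a straightforward extract-method refactor: the warning/error handling that used to be duplicated after `tokenize_nrows` and `tokenize_all_rows` now lives in one helper. A rough pure-Python sketch of the pattern (the class and attribute names below are illustrative stand-ins, not the real Cython `TextReader`):

```python
import sys

class ReaderSketch:
    """Toy stand-in illustrating the extracted-helper pattern."""

    def __init__(self) -> None:
        self.warn_msg: str | None = None  # stands in for parser.warn_msg

    def _check_tokenize_status(self, status: int) -> None:
        # Emit and clear any buffered parser warning, then raise on failure.
        # Both tokenize paths now funnel through this single check.
        if self.warn_msg is not None:
            print(self.warn_msg, file=sys.stderr)
            self.warn_msg = None
        if status < 0:
            raise ValueError("Error tokenizing data")
```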

pandas/core/frame.py (+1, -1)

```diff
@@ -7814,7 +7814,7 @@ def combine(
         if self.empty and len(other) == other_idxlen:
             return other.copy()
 
-        # sorts if possible
+        # sorts if possible; otherwise align above ensures that these are set-equal
         new_columns = this.columns.union(other.columns)
         do_fill = fill_value is not None
         result = {}
```
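The sharpened comment refers to `Index.union`, which sorts the result when the values are sortable; when they are not, the earlier `align` call has already made the two column sets equal, so the union is order-preserving but set-equal either way. For example:

```python
import pandas as pd

# union sorts when it can
pd.Index(["b", "a"]).union(pd.Index(["c", "a"]))
# Index(['a', 'b', 'c'], dtype='object')
```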

pandas/core/groupby/generic.py (+1, -1)

```diff
@@ -1424,7 +1424,7 @@ def _wrap_applied_output_series(
         values: list[Series],
         not_indexed_same: bool,
         first_not_none,
-        key_index,
+        key_index: Index | None,
         is_transform: bool,
     ) -> DataFrame | Series:
         kwargs = first_not_none._construct_axes_dict()
```

pandas/core/groupby/groupby.py (+4, -1)

```diff
@@ -1043,7 +1043,9 @@ def _concat_objects(
             # when the ax has duplicates
             # so we resort to this
             # GH 14776, 30667
+            # TODO: can we re-use e.g. _reindex_non_unique?
             if ax.has_duplicates and not result.axes[self.axis].equals(ax):
+                # e.g. test_category_order_transformer
                 target = algorithms.unique1d(ax._values)
                 indexer, _ = result.index.get_indexer_non_unique(target)
                 result = result.take(indexer, axis=self.axis)
@@ -1460,6 +1462,7 @@ def _agg_py_fallback(
             NotImplementedError.
         """
         # We get here with a) EADtypes and b) object dtype
+        assert alt is not None
 
         if values.ndim == 1:
             # For DataFrameGroupBy we only get here with ExtensionArray
@@ -1775,7 +1778,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
             else:
                 masked = mask & ~isna(bvalues)
 
-            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1)
+            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
             if is_series:
                 assert counted.ndim == 2
                 assert counted.shape[0] == 1
```
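The call-site change simply drops the now-removed `axis` argument. Since `hfunc` is the block-wise helper behind `GroupBy.count`, the change is not observable from the public API; counting non-NA values per group behaves as before (expected output shown as comments):

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, None, 3.0]})
df.groupby("key").count()
#      val
# key
# a      1
# b      1
```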

pandas/core/indexes/base.py (+18, -20)

```diff
@@ -78,7 +78,6 @@
     LossySetitemError,
     can_hold_element,
     common_dtype_categorical_compat,
-    ensure_dtype_can_hold_na,
     find_result_type,
     infer_dtype_from,
     maybe_cast_pointwise_result,
@@ -351,6 +350,7 @@ def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]:
         # can_use_libjoin assures sv and ov are ndarrays
         sv = cast(np.ndarray, sv)
         ov = cast(np.ndarray, ov)
+        # similar but not identical to ov.searchsorted(sv)
         return libjoin.left_join_indexer_unique(sv, ov)
 
     @final
@@ -3128,7 +3128,7 @@ def union(self, other, sort=None):
         if not is_dtype_equal(self.dtype, other.dtype):
             if (
                 isinstance(self, ABCMultiIndex)
-                and not is_object_dtype(unpack_nested_dtype(other))
+                and not is_object_dtype(_unpack_nested_dtype(other))
                 and len(other) > 0
             ):
                 raise NotImplementedError(
@@ -3209,6 +3209,8 @@ def _union(self, other: Index, sort):
             result_dups = algos.union_with_duplicates(self, other)
             return _maybe_try_sort(result_dups, sort)
 
+        # The rest of this method is analogous to Index._intersection_via_get_indexer
+
         # Self may have duplicates; other already checked as unique
         # find indexes of things in "other" that are not in "self"
         if self._index_as_unique:
@@ -3796,7 +3798,7 @@ def _should_partial_index(self, target: Index) -> bool:
             return False
         # See https://github.com/pandas-dev/pandas/issues/47772 the commented
         # out code can be restored (instead of hardcoding `return True`)
-        # once that issue if fixed
+        # once that issue is fixed
         # "Index" has no attribute "left"
         # return self.left._should_compare(target)  # type: ignore[attr-defined]
         return True
@@ -4774,6 +4776,9 @@ def _join_monotonic(
         assert other.dtype == self.dtype
 
         if self.equals(other):
+            # This is a convenient place for this check, but its correctness
+            #  does not depend on monotonicity, so it could go earlier
+            #  in the calling method.
             ret_index = other if how == "right" else self
             return ret_index, None, None
 
@@ -5758,6 +5763,9 @@ def get_indexer_non_unique(
             that = target.astype(dtype, copy=False)
             return this.get_indexer_non_unique(that)
 
+        # TODO: get_indexer has fastpaths for both Categorical-self and
+        #  Categorical-target. Can we do something similar here?
+
         # Note: _maybe_promote ensures we never get here with MultiIndex
         #  self and non-Multi target
         tgt_values = target._get_engine_target()
@@ -5918,7 +5926,7 @@ def _get_indexer_non_comparable(
             If doing an inequality check, i.e. method is not None.
         """
         if method is not None:
-            other = unpack_nested_dtype(target)
+            other = _unpack_nested_dtype(target)
             raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")
 
         no_matches = -1 * np.ones(target.shape, dtype=np.intp)
@@ -5994,16 +6002,6 @@ def _find_common_type_compat(self, target) -> DtypeObj:
         Implementation of find_common_type that adjusts for Index-specific
         special cases.
         """
-        if is_valid_na_for_dtype(target, self.dtype):
-            # e.g. setting NA value into IntervalArray[int64]
-            dtype = ensure_dtype_can_hold_na(self.dtype)
-            if is_dtype_equal(self.dtype, dtype):
-                raise NotImplementedError(
-                    "This should not be reached. Please report a bug at "
-                    "github.com/pandas-dev/pandas"
-                )
-            return dtype
-
         target_dtype, _ = infer_dtype_from(target, pandas_dtype=True)
 
         # special case: if one dtype is uint64 and the other a signed int, return object
@@ -6036,7 +6034,7 @@ def _should_compare(self, other: Index) -> bool:
             #  respectively.
             return False
 
-        other = unpack_nested_dtype(other)
+        other = _unpack_nested_dtype(other)
         dtype = other.dtype
         return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
 
@@ -6048,6 +6046,8 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
             return dtype.kind == "b"
         elif is_numeric_dtype(self.dtype):
             return is_numeric_dtype(dtype)
+        # TODO: this was written assuming we only get here with object-dtype,
+        #  which is no longer correct. Can we specialize for EA?
         return True
 
     @final
@@ -7135,7 +7135,7 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
     return names
 
 
-def unpack_nested_dtype(other: _IndexT) -> _IndexT:
+def _unpack_nested_dtype(other: Index) -> Index:
     """
     When checking if our dtype is comparable with another, we need
     to unpack CategoricalDtype to look at its categories.dtype.
@@ -7149,12 +7149,10 @@ def unpack_nested_dtype(other: _IndexT) -> _IndexT:
     Index
     """
     dtype = other.dtype
-    if is_categorical_dtype(dtype):
+    if isinstance(dtype, CategoricalDtype):
         # If there is ever a SparseIndex, this could get dispatched
         #  here too.
-        # error: Item "dtype[Any]"/"ExtensionDtype" of "Union[dtype[Any],
-        #  ExtensionDtype]" has no attribute "categories"
-        return dtype.categories  # type: ignore[union-attr]
+        return dtype.categories
     return other
```
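Renaming `unpack_nested_dtype` to `_unpack_nested_dtype` marks it module-private, and switching to an `isinstance(dtype, CategoricalDtype)` check lets the type checker see that `dtype.categories` exists, so the `type: ignore` goes away. A minimal sketch of what the helper does, using only public pandas types (the function name below is illustrative):

```python
import pandas as pd
from pandas.api.types import CategoricalDtype

def unpack_nested_dtype_sketch(other: pd.Index) -> pd.Index:
    # For a categorical index, comparability is decided by the categories' dtype.
    dtype = other.dtype
    if isinstance(dtype, CategoricalDtype):
        return dtype.categories  # an Index; its .dtype is what gets compared
    return other

ci = pd.CategoricalIndex([1, 2, 3])
print(unpack_nested_dtype_sketch(ci).dtype)  # int64, not category
```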

pandas/core/indexes/multi.py (+5)

```diff
@@ -2746,6 +2746,7 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
         Index.get_loc : The get_loc method for (single-level) index.
         """
         if is_scalar(key) and isna(key):
+            # TODO: need is_valid_na_for_dtype(key, level_index.dtype)
             return -1
         else:
             return level_index.get_loc(key)
@@ -2818,6 +2819,8 @@ def _maybe_to_slice(loc):
             )
 
         if keylen == self.nlevels and self.is_unique:
+            # TODO: what if we have an IntervalIndex level?
+            #  i.e. do we need _index_as_unique on that level?
             try:
                 return self._engine.get_loc(key)
             except TypeError:
@@ -3853,6 +3856,8 @@ def maybe_droplevels(index: Index, key) -> Index:
     # drop levels
     original_index = index
     if isinstance(key, tuple):
+        # Caller is responsible for ensuring the key is not an entry in the first
+        #  level of the MultiIndex.
         for _ in key:
             try:
                 index = index._drop_level_numbers([0])
```
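The new comment in `maybe_droplevels` documents a caller contract rather than changing behavior; the observable effect is the usual one where indexing a `MultiIndex` with a partial key drops the matched levels:

```python
import pandas as pd

mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]])
s = pd.Series(range(4), index=mi)
s.loc["a"]
# 1    0
# 2    1
# dtype: int64  -- the first level has been dropped
```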

pandas/core/indexing.py (+3)

```diff
@@ -1779,6 +1779,7 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
                     self.obj[key] = empty_value
 
                 else:
+                    # FIXME: GH#42099#issuecomment-864326014
                     self.obj[key] = infer_fill_value(value)
 
             new_indexer = convert_from_missing_indexer_tuple(
@@ -1866,6 +1867,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str):
                 self._setitem_with_indexer_frame_value(indexer, value, name)
 
             elif np.ndim(value) == 2:
+                # TODO: avoid np.ndim call in case it isn't an ndarray, since
+                #  that will construct an ndarray, which will be wasteful
                 self._setitem_with_indexer_2d_value(indexer, value)
 
             elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
```
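The TODO above points at a real (if small) cost: `np.ndim` falls back to `np.asarray(value).ndim` for anything that is not already an ndarray, so a nested list gets materialized as a temporary array just to learn its dimensionality:

```python
import numpy as np

value = [[1, 2], [3, 4]]
np.ndim(value)  # 2 -- but computed by building np.asarray(value) first
```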

pandas/core/internals/blocks.py (+25, -25)

```diff
@@ -429,6 +429,7 @@ def _maybe_downcast(
             return blocks
 
         if self.dtype == _dtype_obj:
+            # TODO: does it matter that self.dtype might not match blocks[i].dtype?
             # GH#44241 We downcast regardless of the argument;
             #  respecting 'downcast=None' may be worthwhile at some point,
             #  but ATM it breaks too much existing code.
@@ -1817,39 +1818,38 @@ def _unwrap_setitem_indexer(self, indexer):
         """
         # TODO: ATM this doesn't work for iget/_slice, can we change that?
 
-        if isinstance(indexer, tuple):
+        if isinstance(indexer, tuple) and len(indexer) == 2:
             # TODO(EA2D): not needed with 2D EAs
             # Should never have length > 2. Caller is responsible for checking.
             # Length 1 is reached vis setitem_single_block and setitem_single_column
             #  each of which pass indexer=(pi,)
-            if len(indexer) == 2:
-                if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
-                    # GH#44703 went through indexing.maybe_convert_ix
-                    first, second = indexer
-                    if not (
-                        second.size == 1 and (second == 0).all() and first.shape[1] == 1
-                    ):
-                        raise NotImplementedError(
-                            "This should not be reached. Please report a bug at "
-                            "github.com/pandas-dev/pandas/"
-                        )
-                    indexer = first[:, 0]
-
-                elif lib.is_integer(indexer[1]) and indexer[1] == 0:
-                    # reached via setitem_single_block passing the whole indexer
-                    indexer = indexer[0]
-
-                elif com.is_null_slice(indexer[1]):
-                    indexer = indexer[0]
-
-                elif is_list_like(indexer[1]) and indexer[1][0] == 0:
-                    indexer = indexer[0]
-
-                else:
-                    raise NotImplementedError(
-                        "This should not be reached. Please report a bug at "
-                        "github.com/pandas-dev/pandas/"
-                    )
+            if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
+                # GH#44703 went through indexing.maybe_convert_ix
+                first, second = indexer
+                if not (
+                    second.size == 1 and (second == 0).all() and first.shape[1] == 1
+                ):
+                    raise NotImplementedError(
+                        "This should not be reached. Please report a bug at "
+                        "github.com/pandas-dev/pandas/"
+                    )
+                indexer = first[:, 0]
+
+            elif lib.is_integer(indexer[1]) and indexer[1] == 0:
+                # reached via setitem_single_block passing the whole indexer
+                indexer = indexer[0]
+
+            elif com.is_null_slice(indexer[1]):
+                indexer = indexer[0]
+
+            elif is_list_like(indexer[1]) and indexer[1][0] == 0:
+                indexer = indexer[0]
+
+            else:
+                raise NotImplementedError(
+                    "This should not be reached. Please report a bug at "
+                    "github.com/pandas-dev/pandas/"
+                )
         return indexer
 
     @property
```
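The `_unwrap_setitem_indexer` change is a pure dedent: folding `len(indexer) == 2` into the outer condition removes one nesting level without changing behavior, since a length-1 tuple previously entered the outer branch, matched nothing, and fell through to `return indexer`, exactly as it now skips the branch entirely. A toy sketch of the equivalence (`handle` is a hypothetical placeholder for the real unwrapping logic):

```python
def old_style(indexer, handle):
    if isinstance(indexer, tuple):
        if len(indexer) == 2:
            return handle(indexer)
    return indexer

def new_style(indexer, handle):
    if isinstance(indexer, tuple) and len(indexer) == 2:
        return handle(indexer)
    return indexer

handle = lambda t: t[0]
assert old_style((5,), handle) == new_style((5,), handle) == (5,)  # length-1: pass through
assert old_style((5, 0), handle) == new_style((5, 0), handle) == 5
```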

pandas/core/internals/concat.py (-3)

```diff
@@ -596,9 +596,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
 
     elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
         # TODO(EA2D): special case not needed if all EAs used HybridBlocks
-        # NB: we are still assuming here that Hybrid blocks have shape (1, N)
-        # concatting with at least one EA means we are concatting a single column
-        # the non-EA values are 2D arrays with shape (1, n)
 
         # error: No overload variant of "__getitem__" of "ExtensionArray" matches
         # argument type "Tuple[int, slice]"
```
