CLN: assorted comments #51410

Merged
merged 1 commit into from Feb 15, 2023

1 change: 1 addition & 0 deletions pandas/_libs/internals.pyx
@@ -7,6 +7,7 @@ from cython cimport Py_ssize_t


cdef extern from "Python.h":
# TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX
Py_ssize_t PY_SSIZE_T_MAX

import numpy as np
1 change: 0 additions & 1 deletion pandas/_libs/lib.pyi
@@ -217,7 +217,6 @@ def count_level_2d(
mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True],
labels: np.ndarray, # const intp_t[:]
max_bin: int,
axis: int,
) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2]
def get_level_sorter(
label: np.ndarray, # const int64_t[:]
25 changes: 8 additions & 17 deletions pandas/_libs/lib.pyx
@@ -921,29 +921,19 @@ def get_level_sorter(
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
const intp_t[:] labels,
Py_ssize_t max_bin,
int axis):
):
cdef:
Py_ssize_t i, j, k, n
ndarray[int64_t, ndim=2] counts

assert (axis == 0 or axis == 1)
n, k = (<object>mask).shape

if axis == 0:
counts = np.zeros((max_bin, k), dtype="i8")
with nogil:
for i in range(n):
for j in range(k):
if mask[i, j]:
counts[labels[i], j] += 1

else: # axis == 1
counts = np.zeros((n, max_bin), dtype="i8")
with nogil:
for i in range(n):
for j in range(k):
if mask[i, j]:
counts[i, labels[j]] += 1
counts = np.zeros((n, max_bin), dtype="i8")
with nogil:
for i in range(n):
for j in range(k):
if mask[i, j]:
counts[i, labels[j]] += 1

return counts
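
The removed axis=0 branch had no remaining callers (the lone call site in groupby, updated below, passed axis=1), so only the per-row path survives. A minimal NumPy sketch of what that path computes; the reference function and sample data are illustrative, not part of the PR:

```python
import numpy as np

# Pure-NumPy reference for the retained (former axis=1) path:
# for each row of `mask`, count the True entries falling in each label bin.
def count_level_2d_ref(mask: np.ndarray, labels: np.ndarray, max_bin: int) -> np.ndarray:
    n, _ = mask.shape
    counts = np.zeros((n, max_bin), dtype=np.int64)
    for i in range(n):
        # labels[mask[i]] is the bin of every True column in row i
        np.add.at(counts[i], labels[mask[i]], 1)
    return counts

mask = np.array([[True, False, True],
                 [True, True, False]])
labels = np.array([0, 1, 0])
count_level_2d_ref(mask, labels, max_bin=2)
# array([[2, 0],
#        [1, 1]])
```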

@@ -1710,6 +1700,7 @@ cdef class Validator:

cdef bint is_valid_null(self, object value) except -1:
return value is None or value is C_NA or util.is_nan(value)
# TODO: include decimal NA?

cdef bint is_array_typed(self) except -1:
return False
13 changes: 4 additions & 9 deletions pandas/_libs/parsers.pyx
@@ -848,6 +849,9 @@ cdef class TextReader:
with nogil:
status = tokenize_nrows(self.parser, nrows, self.encoding_errors)

self._check_tokenize_status(status)

cdef _check_tokenize_status(self, int status):
if self.parser.warn_msg != NULL:
print(PyUnicode_DecodeUTF8(
self.parser.warn_msg, strlen(self.parser.warn_msg),
@@ -879,15 +882,7 @@ cdef class TextReader:
with nogil:
status = tokenize_all_rows(self.parser, self.encoding_errors)

if self.parser.warn_msg != NULL:
print(PyUnicode_DecodeUTF8(
self.parser.warn_msg, strlen(self.parser.warn_msg),
self.encoding_errors), file=sys.stderr)
free(self.parser.warn_msg)
self.parser.warn_msg = NULL

if status < 0:
raise_parser_error("Error tokenizing data", self.parser)
self._check_tokenize_status(status)

if self.parser_start >= self.parser.lines:
raise StopIteration
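
Both tokenize paths now route their status through the shared `_check_tokenize_status` helper instead of duplicating the warn-and-raise logic. A plain-Python sketch of the extracted pattern; the parser object, its attributes, and ParserError are stand-ins for the actual Cython declarations:

```python
import sys

class ParserError(ValueError):
    """Stand-in for the error raised by raise_parser_error."""

def check_tokenize_status(parser, status: int) -> None:
    # Surface any tokenizer warning once, then clear it.
    if parser.warn_msg is not None:
        print(parser.warn_msg, file=sys.stderr)
        parser.warn_msg = None
    # A negative status signals a tokenizer error.
    if status < 0:
        raise ParserError(f"Error tokenizing data: {parser.error_msg}")
```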
2 changes: 1 addition & 1 deletion pandas/core/frame.py
@@ -7814,7 +7814,7 @@ def combine(
if self.empty and len(other) == other_idxlen:
return other.copy()

# sorts if possible
# sorts if possible; otherwise align above ensures that these are set-equal
new_columns = this.columns.union(other.columns)
do_fill = fill_value is not None
result = {}
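
The sharpened comment is easy to demonstrate: `Index.union` sorts when the values are sortable, and when they are not, the earlier align step already guarantees the two column sets are equal as sets. Example behavior, illustrative and not part of the diff:

```python
import pandas as pd

pd.Index([3, 1]).union(pd.Index([2]))
# Index([1, 2, 3], dtype='int64')      <- sortable, so the union is sorted

pd.Index([3, "a"]).union(pd.Index([1]))
# Index([3, 'a', 1], dtype='object')   <- mixed types cannot be sorted; order
#                                         is unspecified, but the result is
#                                         still set-equal to the inputs' union
```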
2 changes: 1 addition & 1 deletion pandas/core/groupby/generic.py
@@ -1424,7 +1424,7 @@ def _wrap_applied_output_series(
values: list[Series],
not_indexed_same: bool,
first_not_none,
key_index,
key_index: Index | None,
is_transform: bool,
) -> DataFrame | Series:
kwargs = first_not_none._construct_axes_dict()
5 changes: 4 additions & 1 deletion pandas/core/groupby/groupby.py
@@ -1043,7 +1043,9 @@ def _concat_objects(
# when the ax has duplicates
# so we resort to this
# GH 14776, 30667
# TODO: can we re-use e.g. _reindex_non_unique?
if ax.has_duplicates and not result.axes[self.axis].equals(ax):
# e.g. test_category_order_transformer
target = algorithms.unique1d(ax._values)
indexer, _ = result.index.get_indexer_non_unique(target)
result = result.take(indexer, axis=self.axis)
@@ -1460,6 +1462,7 @@ def _agg_py_fallback(
NotImplementedError.
"""
# We get here with a) EADtypes and b) object dtype
assert alt is not None

if values.ndim == 1:
# For DataFrameGroupBy we only get here with ExtensionArray
@@ -1775,7 +1778,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
else:
masked = mask & ~isna(bvalues)

counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1)
counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
if is_series:
assert counted.ndim == 2
assert counted.shape[0] == 1
38 changes: 18 additions & 20 deletions pandas/core/indexes/base.py
@@ -78,7 +78,6 @@
LossySetitemError,
can_hold_element,
common_dtype_categorical_compat,
ensure_dtype_can_hold_na,
find_result_type,
infer_dtype_from,
maybe_cast_pointwise_result,
@@ -351,6 +350,7 @@ def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]:
# can_use_libjoin assures sv and ov are ndarrays
sv = cast(np.ndarray, sv)
ov = cast(np.ndarray, ov)
# similar but not identical to ov.searchsorted(sv)
return libjoin.left_join_indexer_unique(sv, ov)
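
The new comment flags a subtlety worth spelling out: `searchsorted` returns insertion points even for values that have no match, so it alone cannot serve as a join indexer, which must distinguish misses. Illustrative only:

```python
import numpy as np

sv = np.array([1, 2, 2, 5])
ov = np.array([1, 2, 4, 5])
ov.searchsorted(sv)
# array([0, 1, 1, 3]) -- insertion points; a value absent from ov (e.g. 3)
# would still get a position, whereas a join indexer must flag non-matches
```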

@final
@@ -3132,7 +3132,7 @@ def union(self, other, sort=None):
if not is_dtype_equal(self.dtype, other.dtype):
if (
isinstance(self, ABCMultiIndex)
and not is_object_dtype(unpack_nested_dtype(other))
and not is_object_dtype(_unpack_nested_dtype(other))
and len(other) > 0
):
raise NotImplementedError(
@@ -3213,6 +3213,8 @@ def _union(self, other: Index, sort):
result_dups = algos.union_with_duplicates(self, other)
return _maybe_try_sort(result_dups, sort)

# The rest of this method is analogous to Index._intersection_via_get_indexer

# Self may have duplicates; other already checked as unique
# find indexes of things in "other" that are not in "self"
if self._index_as_unique:
@@ -3800,7 +3802,7 @@ def _should_partial_index(self, target: Index) -> bool:
return False
# See https://github.com/pandas-dev/pandas/issues/47772 the commented
# out code can be restored (instead of hardcoding `return True`)
# once that issue if fixed
# once that issue is fixed
# "Index" has no attribute "left"
# return self.left._should_compare(target) # type: ignore[attr-defined]
return True
@@ -4778,6 +4780,9 @@ def _join_monotonic(
assert other.dtype == self.dtype

if self.equals(other):
# This is a convenient place for this check, but its correctness
# does not depend on monotonicity, so it could go earlier
# in the calling method.
ret_index = other if how == "right" else self
return ret_index, None, None

@@ -5762,6 +5767,9 @@ def get_indexer_non_unique(
that = target.astype(dtype, copy=False)
return this.get_indexer_non_unique(that)

# TODO: get_indexer has fastpaths for both Categorical-self and
# Categorical-target. Can we do something similar here?

# Note: _maybe_promote ensures we never get here with MultiIndex
# self and non-Multi target
tgt_values = target._get_engine_target()
@@ -5922,7 +5930,7 @@ def _get_indexer_non_comparable(
If doing an inequality check, i.e. method is not None.
"""
if method is not None:
other = unpack_nested_dtype(target)
other = _unpack_nested_dtype(target)
raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")

no_matches = -1 * np.ones(target.shape, dtype=np.intp)
@@ -5998,16 +6006,6 @@ def _find_common_type_compat(self, target) -> DtypeObj:
Implementation of find_common_type that adjusts for Index-specific
special cases.
"""
if is_valid_na_for_dtype(target, self.dtype):
# e.g. setting NA value into IntervalArray[int64]
dtype = ensure_dtype_can_hold_na(self.dtype)
if is_dtype_equal(self.dtype, dtype):
raise NotImplementedError(
"This should not be reached. Please report a bug at "
"github.com/pandas-dev/pandas"
)
return dtype

target_dtype, _ = infer_dtype_from(target, pandas_dtype=True)

# special case: if one dtype is uint64 and the other a signed int, return object
@@ -6040,7 +6038,7 @@ def _should_compare(self, other: Index) -> bool:
# respectively.
return False

other = unpack_nested_dtype(other)
other = _unpack_nested_dtype(other)
dtype = other.dtype
return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)

@@ -6052,6 +6050,8 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
return dtype.kind == "b"
elif is_numeric_dtype(self.dtype):
return is_numeric_dtype(dtype)
# TODO: this was written assuming we only get here with object-dtype,
# which is no longer correct. Can we specialize for EA?
return True

@final
@@ -7141,7 +7141,7 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
return names


def unpack_nested_dtype(other: _IndexT) -> _IndexT:
def _unpack_nested_dtype(other: Index) -> Index:
"""
When checking if our dtype is comparable with another, we need
to unpack CategoricalDtype to look at its categories.dtype.
@@ -7155,12 +7155,10 @@ def unpack_nested_dtype(other: _IndexT) -> _IndexT:
Index
"""
dtype = other.dtype
if is_categorical_dtype(dtype):
if isinstance(dtype, CategoricalDtype):
# If there is ever a SparseIndex, this could get dispatched
# here too.
# error: Item "dtype[Any]"/"ExtensionDtype" of "Union[dtype[Any],
# ExtensionDtype]" has no attribute "categories"
return dtype.categories # type: ignore[union-attr]
return dtype.categories
return other
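
Beyond privatizing the helper, narrowing with `isinstance(dtype, CategoricalDtype)` lets the type checker see the `categories` attribute, so the `type: ignore` can be dropped. What the unpacking looks at, illustratively:

```python
import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "a"])
ci.dtype             # CategoricalDtype(categories=['a', 'b'], ordered=False)
ci.dtype.categories  # Index(['a', 'b'], dtype='object')
# Comparability checks against ci should consult the categories' dtype
# (object here), not the CategoricalDtype wrapper itself.
```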


5 changes: 5 additions & 0 deletions pandas/core/indexes/multi.py
@@ -2746,6 +2746,7 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
Index.get_loc : The get_loc method for (single-level) index.
"""
if is_scalar(key) and isna(key):
# TODO: need is_valid_na_for_dtype(key, level_index.dtype)
return -1
else:
return level_index.get_loc(key)
@@ -2818,6 +2819,8 @@ def _maybe_to_slice(loc):
)

if keylen == self.nlevels and self.is_unique:
# TODO: what if we have an IntervalIndex level?
# i.e. do we need _index_as_unique on that level?
try:
return self._engine.get_loc(key)
except TypeError:
@@ -3853,6 +3856,8 @@ def maybe_droplevels(index: Index, key) -> Index:
# drop levels
original_index = index
if isinstance(key, tuple):
# Caller is responsible for ensuring the key is not an entry in the first
# level of the MultiIndex.
for _ in key:
try:
index = index._drop_level_numbers([0])
3 changes: 3 additions & 0 deletions pandas/core/indexing.py
@@ -1779,6 +1779,7 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
self.obj[key] = empty_value

else:
# FIXME: GH#42099#issuecomment-864326014
self.obj[key] = infer_fill_value(value)

new_indexer = convert_from_missing_indexer_tuple(
@@ -1866,6 +1867,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str):
self._setitem_with_indexer_frame_value(indexer, value, name)

elif np.ndim(value) == 2:
# TODO: avoid np.ndim call in case it isn't an ndarray, since
# that will construct an ndarray, which will be wasteful
self._setitem_with_indexer_2d_value(indexer, value)

elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
50 changes: 25 additions & 25 deletions pandas/core/internals/blocks.py
@@ -429,6 +429,7 @@ def _maybe_downcast(
return blocks

if self.dtype == _dtype_obj:
# TODO: does it matter that self.dtype might not match blocks[i].dtype?
# GH#44241 We downcast regardless of the argument;
# respecting 'downcast=None' may be worthwhile at some point,
# but ATM it breaks too much existing code.
@@ -1817,39 +1818,38 @@ def _unwrap_setitem_indexer(self, indexer):
"""
# TODO: ATM this doesn't work for iget/_slice, can we change that?

if isinstance(indexer, tuple):
if isinstance(indexer, tuple) and len(indexer) == 2:
# TODO(EA2D): not needed with 2D EAs
# Should never have length > 2. Caller is responsible for checking.
# Length 1 is reached vis setitem_single_block and setitem_single_column
# each of which pass indexer=(pi,)
if len(indexer) == 2:
if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
# GH#44703 went through indexing.maybe_convert_ix
first, second = indexer
if not (
second.size == 1 and (second == 0).all() and first.shape[1] == 1
):
raise NotImplementedError(
"This should not be reached. Please report a bug at "
"github.com/pandas-dev/pandas/"
)
indexer = first[:, 0]

elif lib.is_integer(indexer[1]) and indexer[1] == 0:
# reached via setitem_single_block passing the whole indexer
indexer = indexer[0]

elif com.is_null_slice(indexer[1]):
indexer = indexer[0]

elif is_list_like(indexer[1]) and indexer[1][0] == 0:
indexer = indexer[0]

else:
if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
# GH#44703 went through indexing.maybe_convert_ix
first, second = indexer
if not (
second.size == 1 and (second == 0).all() and first.shape[1] == 1
):
raise NotImplementedError(
"This should not be reached. Please report a bug at "
"github.com/pandas-dev/pandas/"
)
indexer = first[:, 0]

elif lib.is_integer(indexer[1]) and indexer[1] == 0:
# reached via setitem_single_block passing the whole indexer
indexer = indexer[0]

elif com.is_null_slice(indexer[1]):
indexer = indexer[0]

elif is_list_like(indexer[1]) and indexer[1][0] == 0:
indexer = indexer[0]

else:
raise NotImplementedError(
"This should not be reached. Please report a bug at "
"github.com/pandas-dev/pandas/"
)
return indexer

@property
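
The restructuring hoists the `len(indexer) == 2` test into the outer condition so every branch loses one indentation level; behavior is unchanged. For the GH#44703 branch in particular, the pair of 2D arrays comes from broadcast-style indexing analogous to `np.ix_`. A sketch under that assumption, not pandas code:

```python
import numpy as np

# indexing.maybe_convert_ix produces broadcastable 2D index arrays,
# much like np.ix_ does:
rows, cols = np.ix_([1, 3], [0])  # rows.shape == (2, 1); cols.shape == (1, 1)

# For a single-column ExtensionArray block the column part is all zeros,
# so the block keeps only the flattened row indexer:
assert cols.size == 1 and (cols == 0).all() and rows.shape[1] == 1
row_indexer = rows[:, 0]          # array([1, 3])
```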
3 changes: 0 additions & 3 deletions pandas/core/internals/concat.py
@@ -596,9 +596,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike

elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
# TODO(EA2D): special case not needed if all EAs used HybridBlocks
# NB: we are still assuming here that Hybrid blocks have shape (1, N)
# concatting with at least one EA means we are concatting a single column
# the non-EA values are 2D arrays with shape (1, n)

# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[int, slice]"