Skip to content

REF: move inference/casting out of Index.__new__ #30596

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 1, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 77 additions & 81 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,43 +370,12 @@ def __new__(
subarr = subarr.copy()

if dtype is None:
inferred = lib.infer_dtype(subarr, skipna=False)
if inferred == "integer":
try:
return cls._try_convert_to_int_index(subarr, copy, name, dtype)
except ValueError:
pass

return Index(subarr, copy=copy, dtype=object, name=name)
elif inferred in ["floating", "mixed-integer-float", "integer-na"]:
# TODO: Returns IntegerArray for integer-na case in the future
return Float64Index(subarr, copy=copy, name=name)
elif inferred == "interval":
try:
return IntervalIndex(subarr, name=name, copy=copy)
except ValueError:
# GH27172: mixed closed Intervals --> object dtype
pass
elif inferred == "boolean":
# don't support boolean explicitly ATM
pass
elif inferred != "string":
if inferred.startswith("datetime"):
try:
return DatetimeIndex(subarr, copy=copy, name=name, **kwargs)
except (ValueError, OutOfBoundsDatetime):
# GH 27011
# If we have mixed timezones, just send it
# down the base constructor
pass

elif inferred.startswith("timedelta"):
return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs)
elif inferred == "period":
try:
return PeriodIndex(subarr, name=name, **kwargs)
except IncompatibleFrequency:
pass
new_data, new_dtype = _maybe_cast_data_without_dtype(subarr)
if new_dtype is not None:
return cls(
new_data, dtype=new_dtype, copy=False, name=name, **kwargs
)

if kwargs:
raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}")
return cls._simple_new(subarr, name, **kwargs)
Expand Down Expand Up @@ -3806,50 +3775,6 @@ def where(self, cond, other=None):
return self._shallow_copy_with_infer(values, dtype=dtype)

# construction helpers
@classmethod
def _try_convert_to_int_index(cls, data, copy, name, dtype):
    """
    Build an integer index from ``data`` if the values survive the cast.

    Candidate integer types are tried in order: signed 64-bit first
    (unless ``dtype`` is unsigned-integer-like), then unsigned 64-bit.
    A candidate is accepted only when every casted element compares
    equal to the corresponding element of ``data``.

    Parameters
    ----------
    data : The data to convert.
    copy : Whether to copy the data or not.
    name : The name of the index returned.
    dtype : The requested dtype; only inspected to decide whether the
        signed 64-bit attempt is skipped.

    Returns
    -------
    int_index : data converted to either an Int64Index or a
        UInt64Index

    Raises
    ------
    ValueError if the conversion was not successful.
    """

    from .numeric import Int64Index, UInt64Index

    # Skip the int64 attempt if a uint-like dtype is passed, as it could
    # yield an Int64Index when a UInt64Index is what's desired.
    candidates = []
    if not is_unsigned_integer_dtype(dtype):
        candidates.append(("i8", Int64Index))
    candidates.append(("u8", UInt64Index))

    for type_code, index_cls in candidates:
        try:
            converted = data.astype(type_code, copy=False)
            if (converted == data).all():
                return index_cls(converted, copy=copy, name=name)
        except (OverflowError, TypeError, ValueError):
            # This candidate failed (possibly due to overflow); fall
            # through to the next one.
            continue

    raise ValueError

@classmethod
def _scalar_data_error(cls, data):
# We return the TypeError so that we can raise it from the constructor
Expand Down Expand Up @@ -5509,6 +5434,77 @@ def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.
return data


def _maybe_cast_data_without_dtype(subarr):
    """
    Infer a supported dtype for array-like input given without a dtype.

    The kind of data is determined with ``lib.infer_dtype`` and, where a
    specialised array type exists for that kind, the input is converted
    to it. If no conversion applies (or the conversion fails in one of
    the tolerated ways), the input is handed back unchanged.

    Parameters
    ----------
    subarr : np.ndarray, Index, or Series

    Returns
    -------
    converted : np.ndarray or ExtensionArray
        The converted data, or ``subarr`` itself when nothing was cast.
    dtype : np.dtype or ExtensionDtype
        The dtype to construct with: ``converted.dtype`` after a
        successful cast, ``object`` for integers that fit neither int64
        nor uint64, ``np.float64`` for float-like input, and
        ``subarr.dtype`` otherwise.
    """
    # Imported at call time because IntervalArray imports Index
    from pandas.core.arrays import (
        DatetimeArray,
        IntervalArray,
        PeriodArray,
        TimedeltaArray,
    )

    kind = lib.infer_dtype(subarr, skipna=False)

    if kind == "integer":
        try:
            as_int = _try_convert_to_int_array(subarr, False, None)
            return as_int, as_int.dtype
        except ValueError:
            return subarr, object

    if kind in ("floating", "mixed-integer-float", "integer-na"):
        # TODO: Returns IntegerArray for integer-na case in the future
        return subarr, np.float64

    # "boolean" (not supported explicitly ATM) and "string" match none of
    # the branches below and fall through to the default return.
    if kind == "interval":
        try:
            intervals = IntervalArray._from_sequence(subarr, copy=False)
            return intervals, intervals.dtype
        except ValueError:
            # GH27172: mixed closed Intervals --> object dtype
            pass
    elif kind.startswith("datetime"):
        try:
            stamps = DatetimeArray._from_sequence(subarr, copy=False)
            return stamps, stamps.dtype
        except (ValueError, OutOfBoundsDatetime):
            # GH 27011
            # If we have mixed timezones, just send it
            # down the base constructor
            pass
    elif kind.startswith("timedelta"):
        deltas = TimedeltaArray._from_sequence(subarr, copy=False)
        return deltas, deltas.dtype
    elif kind == "period":
        try:
            periods = PeriodArray._from_sequence(subarr)
            return periods, periods.dtype
        except IncompatibleFrequency:
            pass

    return subarr, subarr.dtype


def _try_convert_to_int_array(
data: np.ndarray, copy: bool, dtype: np.dtype
) -> np.ndarray:
Expand Down