From 22e35cd79fcafd07c10bfef986fb4864496636c7 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Nov 2022 17:04:00 -0700 Subject: [PATCH 1/2] REF: separate out seen.nat_ cases in maybe_convert_objects --- pandas/_libs/lib.pyx | 126 +++++++++++++++++------------------- pandas/core/indexes/base.py | 2 + 2 files changed, 60 insertions(+), 68 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1b871bf0b745f..76b4f4e52054a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2641,6 +2641,36 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True + if seen.nat_: + if seen.object_: + result = objects + elif seen.bool_: + result = objects + elif seen.null_: + result = objects + elif not safe and seen.nan_: + result = objects + elif seen.numeric_: + result = objects + else: + if convert_datetime and convert_timedelta: + dtype = dtype_if_all_nat + if dtype is not None: + # otherwise we keep object dtype + result = _infer_all_nats( + dtype, objects.shape + ) + else: + result = objects + elif convert_datetime: + result = datetimes + elif convert_timedelta: + result = timedeltas + else: + result = objects + return result + return result + if not seen.object_: result = None if not safe: @@ -2660,36 +2690,15 @@ def maybe_convert_objects(ndarray[object] objects, result = floats else: if not seen.bool_: - if seen.datetime_: - if not seen.numeric_ and not seen.timedelta_: - result = datetimes - elif seen.timedelta_: - if not seen.numeric_: - result = timedeltas - elif seen.nat_: - if not seen.numeric_: - if convert_datetime and convert_timedelta: - dtype = dtype_if_all_nat - if dtype is not None: - # otherwise we keep object dtype - result = _infer_all_nats( - dtype, datetimes, timedeltas - ) - - elif convert_datetime: - result = datetimes - elif convert_timedelta: - result = timedeltas - else: - if seen.complex_: - result = complexes - elif seen.float_: - result = floats - elif seen.int_: - if seen.uint_: - result = uints - else: - result = ints + if seen.complex_: + result = complexes + elif seen.float_: + result = floats + elif seen.int_: + if seen.uint_: + result = uints + else: + result = ints elif seen.is_bool: result = bools.view(np.bool_) @@ -2705,38 +2714,17 @@ def maybe_convert_objects(ndarray[object] objects, result = floats else: if not seen.bool_: - if seen.datetime_: - if not seen.numeric_ and not seen.timedelta_: - result = datetimes - elif seen.timedelta_: - if not seen.numeric_: - result = timedeltas - elif seen.nat_: - if not seen.numeric_: - if convert_datetime and convert_timedelta: - dtype = dtype_if_all_nat - if dtype is not None: - # otherwise we keep object dtype - result = _infer_all_nats( - dtype, datetimes, timedeltas - ) - - elif convert_datetime: - result = datetimes - elif convert_timedelta: - result = timedeltas - else: - if seen.complex_: - if not seen.int_: - result = complexes - elif seen.float_ or seen.nan_: - if not seen.int_: - result = floats - elif seen.int_: - if seen.uint_: - result = uints - else: - result = ints + if seen.complex_: + if not seen.int_: + result = complexes + elif seen.float_ or seen.nan_: + if not seen.int_: + result = floats + elif seen.int_: + if seen.uint_: + result = uints + else: + result = ints elif seen.is_bool and not seen.nan_: result = bools.view(np.bool_) @@ -2751,22 +2739,24 @@ def maybe_convert_objects(ndarray[object] objects, return objects -cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas): +cdef _infer_all_nats(dtype, cnp.npy_intp* shape): """ If we have all-NaT values, cast these to the given dtype. """ if cnp.PyArray_DescrCheck(dtype): # i.e. isinstance(dtype, np.dtype): - if dtype == "M8[ns]": - result = datetimes - elif dtype == "m8[ns]": - result = timedeltas + if dtype == "M8[ns]" or dtype == "m8[ns]": + pass else: raise ValueError(dtype) + + i8vals = cnp.PyArray_EMPTY(1, shape, cnp.NPY_INT64, 0) + i8vals.fill(NPY_NAT) + result = i8vals.view(dtype) else: # ExtensionDtype cls = dtype.construct_array_type() - i8vals = cnp.PyArray_EMPTY(1, datetimes.shape, cnp.NPY_INT64, 0) + i8vals = cnp.PyArray_EMPTY(1, shape, cnp.NPY_INT64, 0) i8vals.fill(NPY_NAT) result = cls(i8vals, dtype=dtype) return result diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 10c2349f05dfd..513c723546f9b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1361,6 +1361,8 @@ def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t] if is_object_dtype(values.dtype): values = cast(np.ndarray, values) + # Only place where we pass safe=True, only needed for + # test_format_missing values = lib.maybe_convert_objects(values, safe=True) result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] From 15e7c7e07488dbd6207a51af24147314382c3e9b Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Nov 2022 09:26:34 -0700 Subject: [PATCH 2/2] REF: simplify maybe_convert_objects --- pandas/_libs/lib.pyx | 98 ++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 54 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 76b4f4e52054a..b7d5b907129c8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1284,13 +1284,12 @@ cdef class Seen: @property def is_bool(self): - return not (self.datetime_ or self.numeric_ or self.timedelta_ - or self.nat_) - - @property - def is_float_or_complex(self): - return not (self.bool_ or self.datetime_ or self.timedelta_ - or self.nat_) + # i.e. not (anything but bool) + return not ( + self.datetime_ or self.datetimetz_ or self.timedelta_ or self.nat_ + or self.period_ or self.interval_ + or self.numeric_ or self.nan_ or self.null_ or self.object_ + ) cdef object _try_infer_map(object dtype): @@ -2671,62 +2670,53 @@ def maybe_convert_objects(ndarray[object] objects, return result return result + if seen.bool_: + if seen.is_bool: + # is_bool property rules out everything else + return bools.view(np.bool_) + seen.object_ = True + if not seen.object_: result = None if not safe: - if seen.null_ or seen.nan_: - if seen.is_float_or_complex: - if seen.complex_: - result = complexes - elif seen.float_: - result = floats - elif seen.int_: - if convert_to_nullable_integer: - from pandas.core.arrays import IntegerArray - result = IntegerArray(ints, mask) - else: - result = floats - elif seen.nan_: + if seen.complex_: + result = complexes + elif seen.float_: + result = floats + elif seen.null_ or seen.nan_: + if seen.int_: + if convert_to_nullable_integer: + from pandas.core.arrays import IntegerArray + result = IntegerArray(ints, mask) + else: result = floats + elif seen.nan_: + result = floats else: - if not seen.bool_: - if seen.complex_: - result = complexes - elif seen.float_: - result = floats - elif seen.int_: - if seen.uint_: - result = uints - else: - result = ints - elif seen.is_bool: - result = bools.view(np.bool_) + if seen.int_: + if seen.uint_: + result = uints + else: + result = ints else: # don't cast int to float, etc. - if seen.null_: - if seen.is_float_or_complex: - if seen.complex_: - if not seen.int_: - result = complexes - elif seen.float_ or seen.nan_: - if not seen.int_: - result = floats + if seen.int_: + if seen.null_ or seen.nan_ or seen.float_ or seen.complex_: + # we have seen something other than int, so we do not + # convert with safe=True. + pass + else: + if seen.uint_: + result = uints + else: + result = ints + else: - if not seen.bool_: - if seen.complex_: - if not seen.int_: - result = complexes - elif seen.float_ or seen.nan_: - if not seen.int_: - result = floats - elif seen.int_: - if seen.uint_: - result = uints - else: - result = ints - elif seen.is_bool and not seen.nan_: - result = bools.view(np.bool_) + if seen.complex_: + result = complexes + elif seen.float_ or seen.nan_: + result = floats if result is uints or result is ints or result is floats or result is complexes: # cast to the largest itemsize when all values are NumPy scalars