Skip to content

Commit dfb968f

Browse files
committed
Add first, last property test
Closes #29
1 parent f38dd19 commit dfb968f

File tree

4 files changed

+160
-33
lines changed

4 files changed

+160
-33
lines changed

flox/aggregations.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ def generic_aggregate(
6969
if func == "identity":
7070
return array
7171

72+
if func in ["nanfirst", "nanlast"] and array.dtype.kind in "US":
73+
func = func[3:]
74+
7275
if engine == "flox":
7376
try:
7477
method = getattr(aggregate_flox, func)
@@ -144,6 +147,8 @@ def _maybe_promote_int(dtype) -> np.dtype:
144147

145148
def _get_fill_value(dtype, fill_value):
146149
"""Returns dtype appropriate infinity. Returns +Inf equivalent for None."""
150+
if fill_value in [None, dtypes.NA] and dtype.kind in "US":
151+
return ""
147152
if fill_value == dtypes.INF or fill_value is None:
148153
return dtypes.get_pos_infinity(dtype, max_for_int=True)
149154
if fill_value == dtypes.NINF:
@@ -516,10 +521,10 @@ def _pick_second(*x):
516521
final_dtype=np.intp,
517522
)
518523

519-
first = Aggregation("first", chunk=None, combine=None, fill_value=0)
520-
last = Aggregation("last", chunk=None, combine=None, fill_value=0)
521-
nanfirst = Aggregation("nanfirst", chunk="nanfirst", combine="nanfirst", fill_value=np.nan)
522-
nanlast = Aggregation("nanlast", chunk="nanlast", combine="nanlast", fill_value=np.nan)
524+
first = Aggregation("first", chunk=None, combine=None, fill_value=None)
525+
last = Aggregation("last", chunk=None, combine=None, fill_value=None)
526+
nanfirst = Aggregation("nanfirst", chunk="nanfirst", combine="nanfirst", fill_value=dtypes.NA)
527+
nanlast = Aggregation("nanlast", chunk="nanlast", combine="nanlast", fill_value=dtypes.NA)
523528

524529
all_ = Aggregation(
525530
"all",
@@ -808,7 +813,7 @@ def _initialize_aggregation(
808813
)
809814

810815
final_dtype = _normalize_dtype(dtype_ or agg.dtype_init["final"], array_dtype, fill_value)
811-
if agg.name not in ["min", "max", "nanmin", "nanmax"]:
816+
if agg.name not in ["first", "last", "nanfirst", "nanlast", "min", "max", "nanmin", "nanmax"]:
812817
final_dtype = _maybe_promote_int(final_dtype)
813818
agg.dtype = {
814819
"user": dtype, # Save to automatically choose an engine

flox/xrdtypes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def get_pos_infinity(dtype, max_for_int=False):
100100

101101
if issubclass(dtype.type, np.integer):
102102
if max_for_int:
103+
dtype = np.int64 if dtype.kind in "Mm" else dtype
103104
return np.iinfo(dtype).max
104105
else:
105106
return np.inf

tests/__init__.py

Lines changed: 105 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -101,25 +101,32 @@ def assert_equal(a, b, tolerance=None):
101101
else:
102102
tolerance = {}
103103

104-
if has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type):
105-
# sometimes it's nice to see values and shapes
106-
# rather than being dropped into some file in dask
107-
np.testing.assert_allclose(a, b, **tolerance)
108-
# does some validation of the dask graph
109-
da.utils.assert_eq(a, b, equal_nan=True)
104+
# Always run the numpy comparison first, so that we get nice error messages with dask.
105+
# sometimes it's nice to see values and shapes
106+
# rather than being dropped into some file in dask
107+
if a.dtype != b.dtype:
108+
raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})")
109+
110+
if has_dask:
111+
a_eager = a.compute() if isinstance(a, dask_array_type) else a
112+
b_eager = b.compute() if isinstance(b, dask_array_type) else b
113+
114+
if a.dtype.kind in "SUMm":
115+
np.testing.assert_equal(a_eager, b_eager)
110116
else:
111-
if a.dtype != b.dtype:
112-
raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})")
117+
np.testing.assert_allclose(a_eager, b_eager, equal_nan=True, **tolerance)
113118

114-
np.testing.assert_allclose(a, b, equal_nan=True, **tolerance)
119+
if has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type):
120+
# does some validation of the dask graph
121+
dask_assert_eq(a, b, equal_nan=True)
115122

116123

117124
def assert_equal_tuple(a, b):
118125
"""assert_equal for .blocks indexing tuples"""
119126
assert len(a) == len(b)
120127

121128
for a_, b_ in zip(a, b):
122-
assert type(a_) == type(b_)
129+
assert type(a_) is type(b_)
123130
if isinstance(a_, np.ndarray):
124131
np.testing.assert_array_equal(a_, b_)
125132
else:
@@ -156,3 +163,91 @@ def assert_equal_tuple(a, b):
156163
"quantile",
157164
"nanquantile",
158165
) + tuple(SCIPY_STATS_FUNCS)
166+
167+
168+
def dask_assert_eq(
    a,
    b,
    check_shape=True,
    check_graph=True,
    check_meta=True,
    check_chunks=True,
    check_ndim=True,
    check_type=True,
    check_dtype=True,
    equal_nan=True,
    scheduler="sync",
    **kwargs,
):
    """dask.array.utils.assert_eq modified to skip value checks. Their code is buggy for some dtypes.
    We just check values through numpy and care about validating the graph in this function."""
    # NOTE(review): check_dtype, equal_nan and **kwargs are accepted but never used in this
    # body — presumably kept so the signature mirrors dask.array.utils.assert_eq; confirm.
    # _get_dt_meta_computed is a private dask helper and may change between dask versions.
    from dask.array.utils import _get_dt_meta_computed

    # Keep references to the raw inputs before they are normalized/computed below,
    # so the meta checks can compare "before compute" against "after compute".
    a_original = a
    b_original = b

    # Promote plain Python scalars/lists to ndarrays so `.shape` / `.item()` work below.
    if isinstance(a, (list, int, float)):
        a = np.array(a)
    if isinstance(b, (list, int, float)):
        b = np.array(b)

    # _get_dt_meta_computed performs the shape/graph/chunks/ndim validation and
    # returns the computed array plus its dtype, meta, and computed counterpart.
    a, adt, a_meta, a_computed = _get_dt_meta_computed(
        a,
        check_shape=check_shape,
        check_graph=check_graph,
        check_chunks=check_chunks,
        check_ndim=check_ndim,
        scheduler=scheduler,
    )
    b, bdt, b_meta, b_computed = _get_dt_meta_computed(
        b,
        check_shape=check_shape,
        check_graph=check_graph,
        check_chunks=check_chunks,
        check_ndim=check_ndim,
        scheduler=scheduler,
    )

    if check_type:
        # Compare scalars as scalars: 0-d arrays are unwrapped with .item().
        _a = a if a.shape else a.item()
        _b = b if b.shape else b.item()
        assert type(_a) is type(_b), f"a and b have different types (a: {type(_a)}, b: {type(_b)})"
    if check_meta:
        # Recursively validate the dask `_meta` arrays against each other.
        if hasattr(a, "_meta") and hasattr(b, "_meta"):
            dask_assert_eq(a._meta, b._meta)
        if hasattr(a_original, "_meta"):
            msg = (
                f"compute()-ing 'a' changes its number of dimensions "
                f"(before: {a_original._meta.ndim}, after: {a.ndim})"
            )
            assert a_original._meta.ndim == a.ndim, msg
            if a_meta is not None:
                msg = (
                    f"compute()-ing 'a' changes its type "
                    f"(before: {type(a_original._meta)}, after: {type(a_meta)})"
                )
                assert type(a_original._meta) is type(a_meta), msg
                if not (np.isscalar(a_meta) or np.isscalar(a_computed)):
                    msg = (
                        f"compute()-ing 'a' results in a different type than implied by its metadata "
                        f"(meta: {type(a_meta)}, computed: {type(a_computed)})"
                    )
                    assert type(a_meta) is type(a_computed), msg
        if hasattr(b_original, "_meta"):
            msg = (
                f"compute()-ing 'b' changes its number of dimensions "
                f"(before: {b_original._meta.ndim}, after: {b.ndim})"
            )
            assert b_original._meta.ndim == b.ndim, msg
            if b_meta is not None:
                msg = (
                    f"compute()-ing 'b' changes its type "
                    f"(before: {type(b_original._meta)}, after: {type(b_meta)})"
                )
                assert type(b_original._meta) is type(b_meta), msg
                if not (np.isscalar(b_meta) or np.isscalar(b_computed)):
                    msg = (
                        f"compute()-ing 'b' results in a different type than implied by its metadata "
                        f"(meta: {type(b_meta)}, computed: {type(b_computed)})"
                    )
                    assert type(b_meta) is type(b_computed), msg

tests/test_properties.py

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]:
5959
func_st = st.sampled_from(
6060
[f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS]
6161
)
62+
numeric_arrays = npst.arrays(
63+
elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st
64+
)
65+
all_arrays = npst.arrays(
66+
elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes()
67+
)
6268

6369

6470
def by_arrays(shape):
@@ -81,13 +87,7 @@ def not_overflowing_array(array) -> bool:
8187
return result
8288

8389

84-
@given(
85-
array=npst.arrays(
86-
elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st
87-
),
88-
dtype=by_dtype_st,
89-
func=func_st,
90-
)
90+
@given(array=numeric_arrays, dtype=by_dtype_st, func=func_st)
9191
def test_groupby_reduce(array, dtype, func):
9292
# overflow behaviour differs between bincount and sum (for example)
9393
assume(not_overflowing_array(array))
@@ -149,17 +149,7 @@ def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
149149

150150

151151
@st.composite
152-
def chunked_arrays(
153-
draw,
154-
*,
155-
chunks=chunks,
156-
arrays=npst.arrays(
157-
elements={"allow_subnormal": False},
158-
shape=npst.array_shapes(max_side=10),
159-
dtype=array_dtype_st,
160-
),
161-
from_array=dask.array.from_array,
162-
):
152+
def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array):
163153
array = draw(arrays)
164154
chunks = draw(chunks(shape=array.shape))
165155

@@ -216,6 +206,7 @@ def test_scans(data, array, func):
216206

217207
@given(data=st.data(), array=chunked_arrays())
218208
def test_ffill_bfill_reverse(data, array):
209+
# TODO: test NaT and timedelta, datetime
219210
assume(not_overflowing_array(np.asarray(array)))
220211
by = data.draw(by_arrays(shape=(array.shape[-1],)))
221212

@@ -230,3 +221,38 @@ def reverse(arr):
230221
backward = groupby_scan(a, by, func="bfill")
231222
forward_reversed = reverse(groupby_scan(reverse(a), reverse(by), func="ffill"))
232223
assert_equal(forward_reversed, backward)
224+
225+
226+
@given(
    data=st.data(),
    array=chunked_arrays(arrays=all_arrays),
    func=st.sampled_from(["first", "last", "nanfirst", "nanlast"]),
)
def test_first_last(data, array, func):
    """Property test for the first/last/nanfirst/nanlast groupby reductions.

    Checks two invariants:
      1. ``func`` on the array equals its inverse on the reversed array
         (e.g. ``first(x) == last(x[::-1])``), with identical groups, for both
         the dask array and its computed numpy counterpart.
      2. For float arrays containing no NaNs, ``func`` agrees with its
         NaN-skipping mate (e.g. ``first == nanfirst``).
    """
    by = data.draw(by_arrays(shape=(array.shape[-1],)))

    # Reversing the array along the reduced axis maps each func to its inverse;
    # MATES pairs each func with its NaN-skipping (or plain) counterpart.
    INVERSES = {"first": "last", "last": "first", "nanfirst": "nanlast", "nanlast": "nanfirst"}
    MATES = {"first": "nanfirst", "last": "nanlast", "nanfirst": "first", "nanlast": "last"}
    inverse = INVERSES[func]
    mate = MATES[func]

    # NOTE(review): "first"/"last" appear to require a single chunk along the
    # reduced (last) axis — the rechunk collapses it to one chunk; confirm.
    if func in ["first", "last"]:
        array = array.rechunk((*array.chunks[:-1], -1))

    # Exercise both the dask-backed and the eager (numpy) code paths.
    for arr in [array, array.compute()]:
        forward, fg = groupby_reduce(arr, by, func=func, engine="flox")
        # Applying the inverse func to the array and `by` reversed along the
        # last axis must give the same result (and the same groups).
        reverse, rg = groupby_reduce(arr[..., ::-1], by[..., ::-1], func=inverse, engine="flox")

        # first/last must preserve the input dtype exactly.
        assert forward.dtype == reverse.dtype
        assert forward.dtype == arr.dtype

        assert_equal(fg, rg)
        assert_equal(forward, reverse)

    # With no NaNs present, the NaN-skipping variant must agree with the plain one.
    if arr.dtype.kind == "f" and not np.isnan(array.compute()).any():
        if mate in ["first", "last"]:
            array = array.rechunk((*array.chunks[:-1], -1))

        first, _ = groupby_reduce(array, by, func=func, engine="flox")
        second, _ = groupby_reduce(array, by, func=mate, engine="flox")
        assert_equal(first, second)

0 commit comments

Comments
 (0)