From 03edc1f9a3f79c824a576ca08a84f3d4f8ce3c73 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Thu, 30 Nov 2023 23:21:51 -0700
Subject: [PATCH] Faster factorize

---
 asv_bench/benchmarks/factorize.py | 70 +++++++++++++++++++++++++++++++
 flox/core.py                      | 58 ++++++++++++++++++++++++-
 2 files changed, 126 insertions(+), 2 deletions(-)
 create mode 100644 asv_bench/benchmarks/factorize.py

diff --git a/asv_bench/benchmarks/factorize.py b/asv_bench/benchmarks/factorize.py
new file mode 100644
index 000000000..7d2eb4ecd
--- /dev/null
+++ b/asv_bench/benchmarks/factorize.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import pandas as pd
+from asv_runner.benchmarks.mark import parameterize
+
+import flox
+
+Nsmall = 4
+Nlarge = 2000
+
+
+class Factorize:
+    """Time the core factorize_ function."""
+
+    def setup(self, *args, **kwargs):
+        # asv skips a benchmark whose setup raises NotImplementedError, so
+        # only subclasses that define ``by_small``/``by_large`` are timed.
+        raise NotImplementedError
+
+    @parameterize(
+        {
+            "expected": (None, (pd.Index([1, 3]),), (pd.RangeIndex(Nsmall),)),
+            "reindex": [True, False],
+            "sort": [True, False],
+        }
+    )
+    def time_factorize_small(self, expected, reindex, sort):
+        flox.core.factorize_(
+            self.by_small,
+            axes=(-1,),
+            expected_groups=expected,
+            reindex=reindex,
+            sort=sort,
+        )
+
+    @parameterize(
+        {
+            "expected": (None, (pd.Index([1, 3]),), (pd.RangeIndex(Nsmall),)),
+            "reindex": [True, False],
+            "sort": [True, False],
+        }
+    )
+    def time_factorize_large(self, expected, reindex, sort):
+        flox.core.factorize_(
+            self.by_large,
+            axes=(-1,),
+            expected_groups=expected,
+            reindex=reindex,
+            sort=sort,
+        )
+
+
+class SingleGrouper1D(Factorize):
+    """Factorize a single one-dimensional grouper."""
+
+    def setup(self, *args, **kwargs):
+        self.by_small = (np.repeat(np.arange(Nsmall), 250),)
+        self.by_large = (np.random.permutation(np.arange(Nlarge)),)
+
+
+class SingleGrouper3D(Factorize):
+    """Factorize a single grouper broadcast to three dimensions."""
+
+    def setup(self, *args, **kwargs):
+        self.by_small = (np.broadcast_to(np.repeat(np.arange(Nsmall), 250), (5, 5, 1000)),)
+        self.by_large = (np.broadcast_to(np.random.permutation(np.arange(Nlarge)), (5, 5, Nlarge)),)
+
+
+# TODO: add the Multiple and CFTimeFactorize benchmark classes.
diff --git a/flox/core.py b/flox/core.py
index cc7211373..66d33338d 100644
--- a/flox/core.py
+++ b/flox/core.py
@@ -541,6 +541,55 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
     return offset, size
 
 
+def fast_isin(ar1, ar2, invert):
+    """
+    Faster version of numpy isin.
+
+    1. Use pd.factorize instead of np.unique
+    2. Skip a bunch of checks
+
+    Parameters
+    ----------
+    ar1 : 1-D array-like
+        Values to test; duplicates are allowed.
+    ar2 : 1-D array-like
+        Values to test against.
+    invert : bool
+        If True, return True where an element of ``ar1`` is *not* in ``ar2``.
+
+    Returns
+    -------
+    np.ndarray
+        Boolean array with one entry per element of ``ar1``.
+    """
+    codes, uniques = pd.factorize(ar1, sort=False)
+
+    ar = np.concatenate((uniques, ar2))
+    if ar.size == 0:
+        # Nothing to compare against (``ar1`` is empty or all-missing and
+        # ``ar2`` is empty), so no element can be a member.
+        return np.full(codes.shape, invert, dtype=bool)
+
+    # We need this to be a stable sort, so always use 'mergesort'
+    # here. The values from the first array should always come before
+    # the values from the second array.
+    order = ar.argsort(kind="mergesort")
+    sar = ar[order]
+    if invert:
+        bool_ar = sar[1:] != sar[:-1]
+    else:
+        bool_ar = sar[1:] == sar[:-1]
+    flag = np.concatenate((bool_ar, [invert]))
+    ret = np.empty(ar.shape, dtype=bool)
+    ret[order] = flag
+
+    # pd.factorize codes missing values as -1, which picks ``ret[-1]``. That
+    # entry belongs to the last element of ``ar`` which, the sort being
+    # stable, has no equal successor in ``sar``; it is therefore always
+    # ``invert``, i.e. missing values are never reported as members.
+    return ret[codes]
+
+
 @overload
 def factorize_(
     by: T_Bys,
@@ -640,11 +689,16 @@ def factorize_(
                 sorter = np.argsort(expect)
                 groups = expect[(sorter,)] if sort else expect
                 idx = np.searchsorted(expect, flat, sorter=sorter)
-                mask = ~np.isin(flat, expect) | isnull(flat) | (idx == len(expect))
+                mask = fast_isin(flat, expect, invert=True)
+                if not np.issubdtype(flat.dtype, np.integer):
+                    # integer arrays cannot hold missing values; skip the scan
+                    mask |= isnull(flat)
+                outside_last_elem_mask = idx == len(expect)
+                mask |= outside_last_elem_mask
                 if not sort:
                     # idx is the index in to the sorted array.
                     # if we didn't want sorting, unsort it back
-                    idx[(idx == len(expect),)] = -1
+                    idx[outside_last_elem_mask] = -1
                     idx = sorter[(idx,)]
                 idx[mask] = -1
             else: