From 79695e5a2f6dcd36e4fbf490adbb624fe4eee58e Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 28 Apr 2023 09:48:04 -0600 Subject: [PATCH 01/11] [skip-ci] Add cftime groupby, resample benchmarks xref #7730 --- asv_bench/benchmarks/groupby.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 8cd23f3947c..d9a7ab1f747 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -132,3 +132,34 @@ def setup(self, *args, **kwargs): super().setup(**kwargs) self.ds1d = self.ds1d.chunk({"time": 50}) self.ds2d = self.ds2d.chunk({"time": 50, "z": 4}) + + +class ResampleCFTime: + def setup(self, *args, **kwargs): + self.ds1d = xr.Dataset( + { + "b": ("time", np.arange(365.0 * 24)), + }, + coords={ + "time": xr.date_range( + "2001-01-01", freq="H", periods=365 * 24, calendar="noleap" + ) + }, + ) + self.ds2d = self.ds1d.expand_dims(z=10) + self.ds1d_mean = self.ds1d.resample(time="48H").mean() + self.ds2d_mean = self.ds2d.resample(time="48H").mean() + + +class GroupByCFTime: + def setup(self, *args, **kwargs): + arr = np.random.randn(10, 10, 365 * 30) + time = xr.date_range("2000", periods=30 * 365, calendar="noleap") + self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time}) + self.gb = self.da.groupby("time.year") + + def time_init(self, ndim): + self.da.groupby("time.year") + + def time_mean(self): + self.gb.mean() From 07996c7cca81b087b5c3c744a149461cebac3226 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 1 May 2023 11:53:45 -0600 Subject: [PATCH 02/11] [skip-ci]try setting temp dir --- .github/workflows/benchmarks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index b9a8d773c5a..41f074d0599 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -45,6 +45,8 @@ jobs: set -x # ID this runner asv machine --yes + export TMPDIR=$RUNNER_TEMP + printenv echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" # Use mamba for env creation From da076e6cfb7f60cc595cb113787c6bd7d88da719 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 1 May 2023 15:25:38 -0600 Subject: [PATCH 03/11] [skip-ci] try mamba? --- .github/workflows/benchmarks.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 41f074d0599..cddd3f8772d 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -30,6 +30,7 @@ jobs: cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" extra-specs: | asv + mamba - name: Run benchmarks @@ -50,8 +51,8 @@ jobs: echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" # Use mamba for env creation - # export CONDA_EXE=$(which mamba) - export CONDA_EXE=$(which conda) + export CONDA_EXE=$(which mamba) + # export CONDA_EXE=$(which conda) # Run benchmarks for current commit against base ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR" asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \ From f2dc76ca8194f9b0a9d98945f5f4e2e4dc1dfc43 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 1 May 2023 15:56:49 -0600 Subject: [PATCH 04/11] [skip-ci] increase conda verbosity --- .github/workflows/benchmarks.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index cddd3f8772d..15ecd10d406 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -30,7 +30,6 @@ jobs: cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" extra-specs: | asv - mamba - name: Run benchmarks @@ -47,12 +46,12 @@ jobs: # ID this runner asv machine --yes export TMPDIR=$RUNNER_TEMP - printenv + export CONDA_VERBOSITY=3 echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" # Use mamba for env creation - export CONDA_EXE=$(which mamba) - # export CONDA_EXE=$(which conda) + # export CONDA_EXE=$(which mamba) + export CONDA_EXE=$(which conda) # Run benchmarks for current commit against base ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR" asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \ From 5a56cb91ea8ea7a39f20f065201af101111e4188 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 1 May 2023 16:12:04 -0600 Subject: [PATCH 05/11] [skip-ci] specify channels --- .github/workflows/benchmarks.yml | 1 - asv_bench/asv.conf.json | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 15ecd10d406..84e9f492113 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -46,7 +46,6 @@ jobs: # ID this runner asv machine --yes export TMPDIR=$RUNNER_TEMP - export CONDA_VERBOSITY=3 echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" # Use mamba for env creation diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index f8387aca856..a709d0a51a7 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -30,6 +30,7 @@ // determined by looking for tools on the PATH environment // variable. "environment_type": "conda", + "conda_channels": ["conda-forge"], // timeout in seconds for installing any dependencies in environment // defaults to 10 min From 358b0a2e539b9fea1e5a1ba711002d1b7719235d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 1 May 2023 16:23:34 -0600 Subject: [PATCH 06/11] [skip-ci] Update .github/workflows/benchmarks.yml --- .github/workflows/benchmarks.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 84e9f492113..b9a8d773c5a 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -45,7 +45,6 @@ jobs: set -x # ID this runner asv machine --yes - export TMPDIR=$RUNNER_TEMP echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" # Use mamba for env creation From 648658b2e2939a8e9f0b5b61afa4de2c4e2a152c Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 1 May 2023 17:12:08 -0600 Subject: [PATCH 07/11] [skip-ci] bugfix --- asv_bench/benchmarks/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d9a7ab1f747..7ef3463ec25 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -158,7 +158,7 @@ def setup(self, *args, **kwargs): self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time}) self.gb = self.da.groupby("time.year") - def time_init(self, ndim): + def time_init(self): self.da.groupby("time.year") def time_mean(self): From 9510678d44daed6e0ed3e0db39bd0be31a00f464 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 1 May 2023 21:41:41 -0600 Subject: [PATCH 08/11] [skip-ci] Parameterize use_flox --- asv_bench/benchmarks/groupby.py | 53 ++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7ef3463ec25..a60eed8fde8 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -18,7 +18,7 @@ def setup(self, *args, **kwargs): "c": xr.DataArray(np.arange(2 * self.n)), } ) - self.ds2d = self.ds1d.expand_dims(z=10) + self.ds2d = self.ds1d.expand_dims(z=10).copy() self.ds1d_mean = self.ds1d.groupby("b").mean() self.ds2d_mean = self.ds2d.groupby("b").mean() @@ -26,15 +26,21 @@ def setup(self, *args, **kwargs): def time_init(self, ndim): getattr(self, f"ds{ndim}d").groupby("b") - @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) - def time_agg_small_num_groups(self, method, ndim): + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_small_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") - getattr(ds.groupby("a"), method)().compute() + with xr.set_options(use_flox=use_flox): + getattr(ds.groupby("a"), method)().compute() - @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) - def time_agg_large_num_groups(self, method, ndim): + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_large_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") - getattr(ds.groupby("b"), method)().compute() + with xr.set_options(use_flox=use_flox): + getattr(ds.groupby("b"), method)().compute() def time_binary_op_1d(self): (self.ds1d.groupby("b") - self.ds1d_mean).compute() @@ -115,10 +121,13 @@ def setup(self, *args, **kwargs): def time_init(self, ndim): getattr(self, f"ds{ndim}d").resample(time="D") - @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) - def time_agg_small_num_groups(self, method, ndim): + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_small_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") - getattr(ds.resample(time="3M"), method)().compute() + with xr.set_options(use_flox=use_flox): + getattr(ds.resample(time="3M"), method)().compute() @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) def time_agg_large_num_groups(self, method, ndim): @@ -151,15 +160,23 @@ def setup(self, *args, **kwargs): self.ds2d_mean = self.ds2d.resample(time="48H").mean() -class GroupByCFTime: - def setup(self, *args, **kwargs): +class GroupByLongTime: + params = [[True, False], [True, False]] + param_names = ["use_cftime", "use_flox"] + + def setup(self, use_cftime, use_flox): arr = np.random.randn(10, 10, 365 * 30) - time = xr.date_range("2000", periods=30 * 365, calendar="noleap") + time = xr.date_range("2000", periods=30 * 365, use_cftime=use_cftime) self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time}) - self.gb = self.da.groupby("time.year") - def time_init(self): - self.da.groupby("time.year") + def time_mean(self, use_cftime, use_flox): + with xr.set_options(use_flox=use_flox): + self.da.groupby("time.year").mean() + - def time_mean(self): - self.gb.mean() +# class GroupByLongCFTime(GroupByLongTime): +# def setup(self, *args, **kwargs): +# arr = np.random.randn(10, 10, 365 * 30) +# time = xr.date_range("2000", periods=30 * 365, calendar="noleap") +# self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time}) +# self.gb = self.da.groupby("time.year") From ab655c93345455d57635dd0bad633486535aa7f6 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 1 May 2023 22:12:56 -0600 Subject: [PATCH 09/11] [skip-ci] cleanup --- asv_bench/benchmarks/groupby.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index a60eed8fde8..475c02b86a1 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -172,11 +172,3 @@ def setup(self, use_cftime, use_flox): def time_mean(self, use_cftime, use_flox): with xr.set_options(use_flox=use_flox): self.da.groupby("time.year").mean() - - -# class GroupByLongCFTime(GroupByLongTime): -# def setup(self, *args, **kwargs): -# arr = np.random.randn(10, 10, 365 * 30) -# time = xr.date_range("2000", periods=30 * 365, calendar="noleap") -# self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time}) -# self.gb = self.da.groupby("time.year") From dfb51a7710411e41d97bf1db8151ab55defdbf7f Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 2 May 2023 08:40:34 -0600 Subject: [PATCH 10/11] [skip-ci] fixes --- asv_bench/benchmarks/groupby.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 475c02b86a1..7447dfa79bc 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -143,7 +143,7 @@ def setup(self, *args, **kwargs): self.ds2d = self.ds2d.chunk({"time": 50, "z": 4}) -class ResampleCFTime: +class ResampleCFTime(Resample): def setup(self, *args, **kwargs): self.ds1d = xr.Dataset( { @@ -160,10 +160,8 @@ def setup(self, *args, **kwargs): self.ds2d_mean = self.ds2d.resample(time="48H").mean() +@parameterized(["use_cftime", "use_flox"], [[True, False], [True, False]]) class GroupByLongTime: - params = [[True, False], [True, False]] - param_names = ["use_cftime", "use_flox"] - def setup(self, use_cftime, use_flox): arr = np.random.randn(10, 10, 365 * 30) time = xr.date_range("2000", periods=30 * 365, use_cftime=use_cftime) From 00fd3be54fedf54d1891f0a4ef37ae98d9fb78b2 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 2 May 2023 09:10:15 -0600 Subject: [PATCH 11/11] [skip-ci] fix resample parameterizing --- asv_bench/benchmarks/groupby.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7447dfa79bc..1b3e55fa659 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -129,10 +129,13 @@ def time_agg_small_num_groups(self, method, ndim, use_flox): with xr.set_options(use_flox=use_flox): getattr(ds.resample(time="3M"), method)().compute() - @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)]) - def time_agg_large_num_groups(self, method, ndim): + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_large_num_groups(self, method, ndim, use_flox): ds = getattr(self, f"ds{ndim}d") - getattr(ds.resample(time="48H"), method)().compute() + with xr.set_options(use_flox=use_flox): + getattr(ds.resample(time="48H"), method)().compute() class ResampleDask(Resample):