From c9ff80023fb329444a2e12cb4e5b67843535a5f7 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 15:33:16 -0400 Subject: [PATCH 1/4] precommit fixup --- pandas/_libs/algos.pyx | 95 ++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6cc55648b9cf4..f2efeedb80d4d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -383,8 +383,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result ndarray[float64_t, ndim=2] ranked_mat - ndarray[float64_t, ndim=1] maskedx - ndarray[float64_t, ndim=1] maskedy + ndarray[float64_t, ndim=1] rankedx, rankedy + float64_t[::1] maskedx, maskedy ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor @@ -399,56 +399,61 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr ranked_mat = np.empty((N, K), dtype=np.float64) + # Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe + # here since N >= nobs and values are stored contiguously + maskedx = np.empty(N, dtype=np.float64) + maskedy = np.empty(N, dtype=np.float64) for i in range(K): ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) - for xi in range(K): - for yi in range(xi + 1): - nobs = 0 - # Keep track of whether we need to recompute ranks - all_ranks = True - for i in range(N): - all_ranks &= not (mask[i, xi] ^ mask[i, yi]) - if mask[i, xi] and mask[i, yi]: - nobs += 1 - - if nobs < minp: - result[xi, yi] = result[yi, xi] = NaN - else: - maskedx = np.empty(nobs, dtype=np.float64) - maskedy = np.empty(nobs, dtype=np.float64) - j = 0 - + with nogil: + for xi in range(K): + for yi in range(xi + 1): + nobs = 0 + # Keep track of whether we need to recompute ranks + all_ranks = True for i in range(N): + all_ranks &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: - maskedx[j] = ranked_mat[i, xi] - maskedy[j] = ranked_mat[i, yi] - j += 1 - - if not all_ranks: - labels_nobs = np.zeros(nobs, dtype=np.int64) - maskedx = rank_1d(maskedx, labels=labels_nobs) - maskedy = rank_1d(maskedy, labels=labels_nobs) - - mean = (nobs + 1) / 2. - - # now the cov numerator - sumx = sumxx = sumyy = 0 - - for i in range(nobs): - vx = maskedx[i] - mean - vy = maskedy[i] - mean - - sumx += vx * vy - sumxx += vx * vx - sumyy += vy * vy - - divisor = sqrt(sumxx * sumyy) + maskedx[nobs] = ranked_mat[i, xi] + maskedy[nobs] = ranked_mat[i, yi] + nobs += 1 - if divisor != 0: - result[xi, yi] = result[yi, xi] = sumx / divisor - else: + if nobs < minp: result[xi, yi] = result[yi, xi] = NaN + else: + if not all_ranks: + with gil: + # We need to slice back to nobs because rank_1d will + # require arrays of nobs length + labels_nobs = np.zeros(nobs, dtype=np.int64) + rankedx = rank_1d(np.array(maskedx)[:nobs], + labels=labels_nobs) + rankedy = rank_1d(np.array(maskedy)[:nobs], + labels=labels_nobs) + for i in range(nobs): + maskedx[i] = rankedx[i] + maskedy[i] = rankedy[i] + + mean = (nobs + 1) / 2. + + # now the cov numerator + sumx = sumxx = sumyy = 0 + + for i in range(nobs): + vx = maskedx[i] - mean + vy = maskedy[i] - mean + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy + + divisor = sqrt(sumxx * sumyy) + + if divisor != 0: + result[xi, yi] = result[yi, xi] = sumx / divisor + else: + result[xi, yi] = result[yi, xi] = NaN return result From d60902afe196d43e97d0a7d4f0260e6a8c5c446c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 15:41:33 -0400 Subject: [PATCH 2/4] Add benchmark seed for stability --- asv_bench/benchmarks/stat_ops.py | 3 ++- doc/source/whatsnew/v1.3.0.rst | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 5639d6702a92c..64842286ac07c 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -95,10 +95,11 @@ def time_average_old(self, constructor, pct): class Correlation: - params = [["spearman", "kendall", "pearson"]] + params = [["spearman"]] param_names = ["method"] def setup(self, method): + np.random.seed(0) self.df = pd.DataFrame(np.random.randn(500, 15)) self.df2 = pd.DataFrame(np.random.randn(500, 15)) self.df_wide = pd.DataFrame(np.random.randn(500, 100)) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8b413808503ad..d24aaf1affccc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -806,6 +806,7 @@ Performance improvements - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) - Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`) - Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`) - Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`) - Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.ExpandingGroupby.corr` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`) - Performance improvement in :func:`unique` for object data type (:issue:`37615`) From 88860595bfe245b95a8db64e31b441e15d2f04af Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 15:50:00 -0400 Subject: [PATCH 3/4] Add back all bench methods --- asv_bench/benchmarks/stat_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 64842286ac07c..ddba88db830f0 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -95,7 +95,7 @@ def time_average_old(self, constructor, pct): class Correlation: - params = [["spearman"]] + params = [["spearman", "kendall", "pearson"]] param_names = ["method"] def setup(self, method): From ff9519f0cd74c03dca269e0a4a4c45244d89fbbd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 8 Jun 2021 12:27:39 -0400 Subject: [PATCH 4/4] Remove random seed --- asv_bench/benchmarks/stat_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index ddba88db830f0..5639d6702a92c 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -99,7 +99,6 @@ class Correlation: param_names = ["method"] def setup(self, method): - np.random.seed(0) self.df = pd.DataFrame(np.random.randn(500, 15)) self.df2 = pd.DataFrame(np.random.randn(500, 15)) self.df_wide = pd.DataFrame(np.random.randn(500, 100))