From c9ff80023fb329444a2e12cb4e5b67843535a5f7 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Mon, 7 Jun 2021 15:33:16 -0400
Subject: [PATCH 1/4] precommit fixup

---
 pandas/_libs/algos.pyx | 95 ++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 45 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 6cc55648b9cf4..f2efeedb80d4d 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -383,8 +383,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
         Py_ssize_t i, j, xi, yi, N, K
         ndarray[float64_t, ndim=2] result
         ndarray[float64_t, ndim=2] ranked_mat
-        ndarray[float64_t, ndim=1] maskedx
-        ndarray[float64_t, ndim=1] maskedy
+        ndarray[float64_t, ndim=1] rankedx, rankedy
+        float64_t[::1] maskedx, maskedy
         ndarray[uint8_t, ndim=2] mask
         int64_t nobs = 0
         float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
@@ -399,56 +399,61 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
 
     ranked_mat = np.empty((N, K), dtype=np.float64)
 
+    # Note: maskedx, maskedy are allocated once with length N; only the first nobs
+    # entries are written and read per column pair, which is safe since nobs <= N
+    maskedx = np.empty(N, dtype=np.float64)
+    maskedy = np.empty(N, dtype=np.float64)
     for i in range(K):
         ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n)
 
-    for xi in range(K):
-        for yi in range(xi + 1):
-            nobs = 0
-            # Keep track of whether we need to recompute ranks
-            all_ranks = True
-            for i in range(N):
-                all_ranks &= not (mask[i, xi] ^ mask[i, yi])
-                if mask[i, xi] and mask[i, yi]:
-                    nobs += 1
-
-            if nobs < minp:
-                result[xi, yi] = result[yi, xi] = NaN
-            else:
-                maskedx = np.empty(nobs, dtype=np.float64)
-                maskedy = np.empty(nobs, dtype=np.float64)
-                j = 0
-
+    with nogil:
+        for xi in range(K):
+            for yi in range(xi + 1):
+                nobs = 0
+                # Keep track of whether we need to recompute ranks
+                all_ranks = True
                 for i in range(N):
+                    all_ranks &= not (mask[i, xi] ^ mask[i, yi])
                     if mask[i, xi] and mask[i, yi]:
-                        maskedx[j] = ranked_mat[i, xi]
-                        maskedy[j] = ranked_mat[i, yi]
-                        j += 1
-
-                if not all_ranks:
-                    labels_nobs = np.zeros(nobs, dtype=np.int64)
-                    maskedx = rank_1d(maskedx, labels=labels_nobs)
-                    maskedy = rank_1d(maskedy, labels=labels_nobs)
-
-                mean = (nobs + 1) / 2.
-
-                # now the cov numerator
-                sumx = sumxx = sumyy = 0
-
-                for i in range(nobs):
-                    vx = maskedx[i] - mean
-                    vy = maskedy[i] - mean
-
-                    sumx += vx * vy
-                    sumxx += vx * vx
-                    sumyy += vy * vy
-
-                divisor = sqrt(sumxx * sumyy)
+                        maskedx[nobs] = ranked_mat[i, xi]
+                        maskedy[nobs] = ranked_mat[i, yi]
+                        nobs += 1
 
-                if divisor != 0:
-                    result[xi, yi] = result[yi, xi] = sumx / divisor
-                else:
+                if nobs < minp:
                     result[xi, yi] = result[yi, xi] = NaN
+                else:
+                    if not all_ranks:
+                        with gil:
+                            # Pass only the first nobs entries: the rest of the
+                            # buffer is unused, and labels_nobs has length nobs
+                            labels_nobs = np.zeros(nobs, dtype=np.int64)
+                            rankedx = rank_1d(np.array(maskedx)[:nobs],
+                                              labels=labels_nobs)
+                            rankedy = rank_1d(np.array(maskedy)[:nobs],
+                                              labels=labels_nobs)
+                        for i in range(nobs):
+                            maskedx[i] = rankedx[i]
+                            maskedy[i] = rankedy[i]
+
+                    mean = (nobs + 1) / 2.
+
+                    # now the cov numerator
+                    sumx = sumxx = sumyy = 0
+
+                    for i in range(nobs):
+                        vx = maskedx[i] - mean
+                        vy = maskedy[i] - mean
+
+                        sumx += vx * vy
+                        sumxx += vx * vx
+                        sumyy += vy * vy
+
+                    divisor = sqrt(sumxx * sumyy)
+
+                    if divisor != 0:
+                        result[xi, yi] = result[yi, xi] = sumx / divisor
+                    else:
+                        result[xi, yi] = result[yi, xi] = NaN
 
     return result
 

From d60902afe196d43e97d0a7d4f0260e6a8c5c446c Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Mon, 7 Jun 2021 15:41:33 -0400
Subject: [PATCH 2/4] Add benchmark seed for stability

---
 asv_bench/benchmarks/stat_ops.py | 3 ++-
 doc/source/whatsnew/v1.3.0.rst   | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 5639d6702a92c..64842286ac07c 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -95,10 +95,11 @@ def time_average_old(self, constructor, pct):
 
 class Correlation:
 
-    params = [["spearman", "kendall", "pearson"]]
+    params = [["spearman"]]
     param_names = ["method"]
 
     def setup(self, method):
+        np.random.seed(0)
         self.df = pd.DataFrame(np.random.randn(500, 15))
         self.df2 = pd.DataFrame(np.random.randn(500, 15))
         self.df_wide = pd.DataFrame(np.random.randn(500, 100))
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 8b413808503ad..d24aaf1affccc 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -806,6 +806,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
 - Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
 - Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`)
+- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`)
 - Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`)
 - Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.ExpandingGroupby.corr` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`)
 - Performance improvement in :func:`unique` for object data type (:issue:`37615`)

From 88860595bfe245b95a8db64e31b441e15d2f04af Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Mon, 7 Jun 2021 15:50:00 -0400
Subject: [PATCH 3/4] Add back all bench methods

---
 asv_bench/benchmarks/stat_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 64842286ac07c..ddba88db830f0 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -95,7 +95,7 @@ def time_average_old(self, constructor, pct):
 
 class Correlation:
 
-    params = [["spearman"]]
+    params = [["spearman", "kendall", "pearson"]]
     param_names = ["method"]
 
     def setup(self, method):

From ff9519f0cd74c03dca269e0a4a4c45244d89fbbd Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Tue, 8 Jun 2021 12:27:39 -0400
Subject: [PATCH 4/4] Remove random seed

---
 asv_bench/benchmarks/stat_ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index ddba88db830f0..5639d6702a92c 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -99,7 +99,6 @@ class Correlation:
     param_names = ["method"]
 
     def setup(self, method):
-        np.random.seed(0)
         self.df = pd.DataFrame(np.random.randn(500, 15))
         self.df2 = pd.DataFrame(np.random.randn(500, 15))
         self.df_wide = pd.DataFrame(np.random.randn(500, 100))