diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 287ec69f65cd6..757f713ad0767 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -858,6 +858,7 @@ Performance improvements - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) - Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`) - Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`) - Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`) - Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.ExpandingGroupby.corr` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`) - Performance improvement in :func:`unique` for object data type (:issue:`37615`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6cc55648b9cf4..f2efeedb80d4d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -383,8 +383,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result ndarray[float64_t, ndim=2] ranked_mat - ndarray[float64_t, ndim=1] maskedx - ndarray[float64_t, ndim=1] maskedy + ndarray[float64_t, ndim=1] rankedx, rankedy + float64_t[::1] maskedx, maskedy ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor @@ -399,56 +399,61 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr ranked_mat = np.empty((N, K), dtype=np.float64) + # Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe + # here since N >= nobs and values are stored contiguously + maskedx = np.empty(N, dtype=np.float64) + maskedy = np.empty(N, dtype=np.float64) for i in range(K): ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) - for xi in range(K): - for yi in range(xi + 1): - nobs = 0 - # Keep track of whether we need to recompute ranks - all_ranks = True - for i in range(N): - all_ranks &= not (mask[i, xi] ^ mask[i, yi]) - if mask[i, xi] and mask[i, yi]: - nobs += 1 - - if nobs < minp: - result[xi, yi] = result[yi, xi] = NaN - else: - maskedx = np.empty(nobs, dtype=np.float64) - maskedy = np.empty(nobs, dtype=np.float64) - j = 0 - + with nogil: + for xi in range(K): + for yi in range(xi + 1): + nobs = 0 + # Keep track of whether we need to recompute ranks + all_ranks = True for i in range(N): + all_ranks &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: - maskedx[j] = ranked_mat[i, xi] - maskedy[j] = ranked_mat[i, yi] - j += 1 - - if not all_ranks: - labels_nobs = np.zeros(nobs, dtype=np.int64) - maskedx = rank_1d(maskedx, labels=labels_nobs) - maskedy = rank_1d(maskedy, labels=labels_nobs) - - mean = (nobs + 1) / 2. - - # now the cov numerator - sumx = sumxx = sumyy = 0 - - for i in range(nobs): - vx = maskedx[i] - mean - vy = maskedy[i] - mean - - sumx += vx * vy - sumxx += vx * vx - sumyy += vy * vy - - divisor = sqrt(sumxx * sumyy) + maskedx[nobs] = ranked_mat[i, xi] + maskedy[nobs] = ranked_mat[i, yi] + nobs += 1 - if divisor != 0: - result[xi, yi] = result[yi, xi] = sumx / divisor - else: + if nobs < minp: result[xi, yi] = result[yi, xi] = NaN + else: + if not all_ranks: + with gil: + # We need to slice back to nobs because rank_1d will + # require arrays of nobs length + labels_nobs = np.zeros(nobs, dtype=np.int64) + rankedx = rank_1d(np.array(maskedx)[:nobs], + labels=labels_nobs) + rankedy = rank_1d(np.array(maskedy)[:nobs], + labels=labels_nobs) + for i in range(nobs): + maskedx[i] = rankedx[i] + maskedy[i] = rankedy[i] + + mean = (nobs + 1) / 2. + + # now the cov numerator + sumx = sumxx = sumyy = 0 + + for i in range(nobs): + vx = maskedx[i] - mean + vy = maskedy[i] - mean + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy + + divisor = sqrt(sumxx * sumyy) + + if divisor != 0: + result[xi, yi] = result[yi, xi] = sumx / divisor + else: + result[xi, yi] = result[yi, xi] = NaN return result