From 361325c226d441dd5cbb279a96511569b1c67bce Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 27 Jul 2021 21:23:04 -0400 Subject: [PATCH 1/3] PERF: nancorr --- pandas/_libs/algos.pyx | 57 ++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 46ccf5d3f9fa9..f3cda80d8879c 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -326,8 +326,11 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): Py_ssize_t i, j, xi, yi, N, K bint minpv float64_t[:, ::1] result + float64_t[::1] means=None, ssqds=None ndarray[uint8_t, ndim=2] mask + bint no_nans int64_t nobs = 0 + float64_t mean, ssqd, val float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy N, K = (mat).shape @@ -339,25 +342,53 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + no_nans = mask.all() + if no_nans: + means = np.empty(K, dtype=np.float64) + ssqds = np.empty(K, dtype=np.float64) + + with nogil: + for j in range(K): + ssqd = mean = 0 + for i in range(N): + val = mat[i, j] + dx = val - mean + mean += 1 / (i + 1) * dx + ssqd += (val - mean) * dx + + means[j] = mean + ssqds[j] = ssqd with nogil: for xi in range(K): for yi in range(xi + 1): - # Welford's method for the variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 - for i in range(N): - if mask[i, xi] and mask[i, yi]: + covxy = 0 + if no_nans: + for i in range(N): vx = mat[i, xi] vy = mat[i, yi] - nobs += 1 - dx = vx - meanx - dy = vy - meany - meanx += 1 / nobs * dx - meany += 1 / nobs * dy - ssqdmx += (vx - meanx) * dx - ssqdmy += (vy - meany) * dy - covxy += (vx - meanx) * dy + covxy += (vx - means[xi]) * (vy - means[yi]) + + ssqdmx = ssqds[xi] + ssqdmy = ssqds[yi] + nobs = N + + else: + nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 + for i in range(N): + # Welford's method for the variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] + vy = mat[i, yi] + nobs += 1 + dx = vx - meanx + dy = vy - meany + meanx += 1 / nobs * dx + meany += 1 / nobs * dy + ssqdmx += (vx - meanx) * dx + ssqdmy += (vy - meany) * dy + covxy += (vx - meanx) * dy if nobs < minpv: result[xi, yi] = result[yi, xi] = NaN From c9a9958545d07b9bdfd45596174a839c04fb88a1 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 27 Jul 2021 21:41:42 -0400 Subject: [PATCH 2/3] Add whatsnew and some comments --- doc/source/whatsnew/v1.4.0.rst | 2 ++ pandas/_libs/algos.pyx | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d8d00db47e03d..9da88de12fca7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -167,6 +167,8 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`, :issue:`41885`) +- .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index f3cda80d8879c..f0b896b071723 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -326,6 +326,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): Py_ssize_t i, j, xi, yi, N, K bint minpv float64_t[:, ::1] result + # Initialize to None since we only use in the no missing value case float64_t[::1] means=None, ssqds=None ndarray[uint8_t, ndim=2] mask bint no_nans @@ -343,6 +344,10 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) no_nans = mask.all() + + # Computing the online means and variances is expensive - so if possible we can + # precompute these and avoid repeating the computations each time we handle + # an (xi, yi) pair if no_nans: means = np.empty(K, dtype=np.float64) ssqds = np.empty(K, dtype=np.float64) From 655e6d8b1211d51c3035d3bffd6487e17976c1b8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 27 Jul 2021 21:59:39 -0400 Subject: [PATCH 3/3] Fix issue ref --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9da88de12fca7..dbb0ae0b1e566 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -167,7 +167,7 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) -- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`, :issue:`41885`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`) - .. ---------------------------------------------------------------------------