Skip to content

Commit cce2dbe

Browse files
committed
Fix weighted rolling variance implementation
1 parent 723feb9 commit cce2dbe

File tree

2 files changed

+52
-91
lines changed

2 files changed

+52
-91
lines changed

pandas/_libs/window/aggregations.pyx

Lines changed: 37 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,7 +1558,7 @@ cdef float64_t calc_weighted_var(float64_t t,
15581558
if nobs == 1:
15591559
result = 0
15601560
else:
1561-
result = t * win_n / ((win_n - ddof) * sum_w)
1561+
result = t * nobs / ((nobs - ddof) * sum_w)
15621562
if result < 0:
15631563
result = 0
15641564
else:
@@ -1599,68 +1599,20 @@ cdef void add_weighted_var(float64_t val,
15991599
cdef:
16001600
float64_t temp, q, r
16011601

1602-
if val != val:
1603-
return
1604-
16051602
nobs[0] = nobs[0] + 1
16061603

1607-
q = val - mean[0]
1608-
temp = sum_w[0] + w
1609-
r = q * w / temp
1610-
1611-
mean[0] = mean[0] + r
1612-
t[0] = t[0] + r * sum_w[0] * q
1613-
sum_w[0] = temp
1614-
1615-
1616-
cdef void remove_weighted_var(float64_t val,
1617-
float64_t w,
1618-
float64_t *t,
1619-
float64_t *sum_w,
1620-
float64_t *mean,
1621-
float64_t *nobs) noexcept nogil:
1622-
"""
1623-
Update weighted mean, sum of weights and sum of weighted squared
1624-
differences to remove value and weight pair from weighted variance
1625-
calculation using West's method.
1626-
1627-
Paper: https://dl.acm.org/citation.cfm?id=359153
1628-
1629-
Parameters
1630-
----------
1631-
val: float64_t
1632-
window values
1633-
w: float64_t
1634-
window weights
1635-
t: float64_t
1636-
sum of weighted squared differences
1637-
sum_w: float64_t
1638-
sum of weights
1639-
mean: float64_t
1640-
weighted mean
1641-
nobs: float64_t
1642-
number of observations
1643-
"""
1644-
1645-
cdef:
1646-
float64_t temp, q, r
1647-
1648-
if val == val:
1649-
nobs[0] = nobs[0] - 1
1650-
1651-
if nobs[0]:
1652-
q = val - mean[0]
1653-
temp = sum_w[0] - w
1654-
r = q * w / temp
1604+
if nobs[0] == 1:
1605+
sum_w[0] = w
1606+
mean[0] = val
16551607

1656-
mean[0] = mean[0] - r
1657-
t[0] = t[0] - r * sum_w[0] * q
1658-
sum_w[0] = temp
1608+
else:
1609+
q = val - mean[0]
1610+
temp = sum_w[0] + w
1611+
r = q * w / temp
16591612

1660-
else:
1661-
t[0] = 0
1662-
sum_w[0] = 0
1663-
mean[0] = 0
1613+
mean[0] = mean[0] + r
1614+
t[0] = t[0] + r * sum_w[0] * q
1615+
sum_w[0] = temp
16641616

16651617

16661618
def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
@@ -1690,44 +1642,38 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
16901642
"""
16911643

16921644
cdef:
1693-
float64_t t = 0, sum_w = 0, mean = 0, nobs = 0
1694-
float64_t val, pre_val, w, pre_w
1695-
Py_ssize_t i, n, win_n
1696-
float64_t[:] output
1645+
float64_t val, w
1646+
Py_ssize_t in_i, win_i, add_i, n, win_n
1647+
float64_t[:] output, t, mean, sum_w, nobs
16971648

16981649
n = len(values)
16991650
win_n = len(weights)
1651+
17001652
output = np.empty(n, dtype=np.float64)
1653+
t = np.zeros(n, dtype=np.float64)
1654+
mean = np.zeros(n, dtype=np.float64)
1655+
sum_w = np.zeros(n, dtype=np.float64)
1656+
nobs = np.zeros(n, dtype=np.float64)
17011657

17021658
with nogil:
1703-
1704-
for i in range(min(win_n, n)):
1705-
add_weighted_var(values[i], weights[i], &t,
1706-
&sum_w, &mean, &nobs)
1707-
1708-
output[i] = calc_weighted_var(t, sum_w, win_n,
1709-
ddof, nobs, minp)
1710-
1711-
for i in range(win_n, n):
1712-
val = values[i]
1713-
pre_val = values[i - win_n]
1714-
1715-
w = weights[i % win_n]
1716-
pre_w = weights[(i - win_n) % win_n]
1717-
1718-
if val == val:
1719-
if pre_val == pre_val:
1720-
remove_weighted_var(pre_val, pre_w, &t,
1721-
&sum_w, &mean, &nobs)
1722-
1723-
add_weighted_var(val, w, &t, &sum_w, &mean, &nobs)
1724-
1725-
elif pre_val == pre_val:
1726-
remove_weighted_var(pre_val, pre_w, &t,
1727-
&sum_w, &mean, &nobs)
1728-
1729-
output[i] = calc_weighted_var(t, sum_w, win_n,
1730-
ddof, nobs, minp)
1659+
for win_i in range(win_n):
1660+
w = weights[win_i]
1661+
if w != w:
1662+
continue
1663+
1664+
for in_i in range(n - (win_n - win_i) + 1):
1665+
val = values[in_i]
1666+
1667+
if val == val:
1668+
add_i = in_i + (win_n - win_i) - 1
1669+
add_weighted_var(
1670+
val, w, &t[add_i], &sum_w[add_i], &mean[add_i], &nobs[add_i]
1671+
)
1672+
1673+
for in_i in range(n):
1674+
output[in_i] = calc_weighted_var(
1675+
t[in_i], sum_w[in_i], win_n, ddof, nobs[in_i], minp
1676+
)
17311677

17321678
return output
17331679

pandas/tests/window/test_win_type.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,6 +661,21 @@ def test_cmov_window_special_linear_range(win_types_special, step):
661661
tm.assert_series_equal(xp, rs)
662662

663663

664+
@pytest.mark.parametrize("size", [10, 100, 1000])
665+
@pytest.mark.parametrize("scale", [0, 0.1, 0.01, 0.001, 0.0001])
666+
# Cross-check the weighted rolling std (ddof=0) against the value derived from
# two weighted rolling means: sqrt(mean(x**2) - mean(x)**2).
# NOTE(review): the `step` fixture is requested but never passed to
# `rolling(...)` in this body -- confirm whether it is intended to be used.
def test_weighted_std_through_mean(win_types, step, scale, size):
667+
# GH 53273
668+
# Seeded generator (seed 0): samples are normal around loc=1 with the
# parametrized `scale`; `scale` includes 0.
s = Series(np.random.default_rng(0).normal(loc=1, scale=scale, size=size))
669+
670+
# Reference value built only from rolling means over the same window (5)
# and the same `win_type`.
squared_mean = (s**2).rolling(5, win_type=win_types).mean()
671+
mean = s.rolling(5, win_type=win_types).mean()
672+
# NOTE(review): if rounding makes `squared_mean - mean**2` slightly negative,
# `.pow(0.5)` would give NaN here -- presumably not hit by these inputs; verify.
expected = (squared_mean - mean**2).pow(0.5)
673+
674+
# Value under test: the weighted rolling std with ddof=0.
result = s.rolling(5, win_type=win_types).std(ddof=0)
675+
676+
tm.assert_series_equal(expected, result)
677+
678+
664679
def test_weighted_var_big_window_no_segfault(win_types, center):
665680
# GitHub Issue #46772
666681
pytest.importorskip("scipy")

0 commit comments

Comments
 (0)