diff --git a/ci/lint.sh b/ci/lint.sh index 9f582f72fcdd7..144febcfcece5 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -17,7 +17,19 @@ if [ "$LINT" ]; then fi done - echo "Linting DONE" + echo "Linting *.py DONE" + + echo "Linting *.pyx" + for path in 'window.pyx' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126,E128 + if [ $? -ne "0" ]; then + RET=1 + fi + + done + echo "Linting *.pyx DONE" echo "Check for invalid testing" grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 59675e33e724b..12e0ecfba97da 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -391,6 +391,91 @@ For some windowing functions, additional parameters must be specified: such that the weights are normalized with respect to each other. Weights of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. +.. _stats.moments.ts: + +Time-aware Rolling +~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +New in version 0.19.0 are the ability to pass an offset (or convertible) to a ``.rolling()`` method and have it produce +variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring +within the indicated time delta. + +This can be particularly useful for a non-regular time frequency index. + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling('2s').sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + + +.. ipython:: python + + + dft = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) + + dft + dft.rolling(2).sum() + + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling('2s').sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling('2s', on='foo').sum() + +.. _stats.moments.ts-versus-resampling: + +Time-aware Rolling vs. Resampling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They +both operate and perform reductive operations on time-indexed pandas objects. + +When using ``.rolling()`` with an offset. The offset is a time-delta. Take a backwards-in-time looking window, and +aggregate all of the values in that window (including the end-point, but not the start-point). This is the new value +at that point in the result. These are variable sized windows in time-space for each point of the input. You will get +a same sized result as the input. + +When using ``.resample()`` with an offset. Construct a new index that is the frequency of the offset. For each frequency +bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this +aggregation is the output for that frequency point. The windows are fixed size size in the frequency space. Your result +will have the shape of a regular frequency between the min and the max of the original input object. + +To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. + Centering Windows ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 7e832af14c051..0bce0227f4518 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1324,7 +1324,11 @@ performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. -``resample`` is a time-based groupby, followed by a reduction method on each of its groups. +``.resample()`` is a time-based groupby, followed by a reduction method on each of its groups. + +.. note:: + + ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion `here ` See some :ref:`cookbook examples ` for some advanced strategies diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 688f3b7ff6ada..9b196ec49d685 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -3,16 +3,17 @@ v0.19.0 (August ??, 2016) ------------------------- -This is a major release from 0.18.2 and includes a small number of API changes, several new features, +This is a major release from 0.18.1 and includes a small number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` +- ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` -.. contents:: What's new in v0.18.2 +.. contents:: What's new in v0.19.0 :local: :backlinks: none @@ -131,6 +132,64 @@ that forward filling happens automatically taking the most recent non-NaN value. This returns a merged DataFrame with the entries in the same order as the original left passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. +.. _whatsnew_0190.enhancements.rolling_ts: + +``.rolling()`` are now time-series aware +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``.rolling()`` objects are now time-series aware and can accept a time-series offset (or convertible) for the ``window`` argument (:issue:`13327`, :issue:`12995`) +See the full documentation :ref:`here `. + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling('2s').sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + +.. ipython:: python + + + dft = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) + + dft + dft.rolling(2).sum() + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling('2s').sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling('2s', on='foo').sum() + .. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: :func:`read_csv` has improved support for duplicate column names diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d6e6f571be53a..2fe53d6c27f8f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5343,11 +5343,12 @@ def _add_series_or_dataframe_operations(cls): @Appender(rwindow.rolling.__doc__) def rolling(self, window, min_periods=None, freq=None, center=False, - win_type=None, axis=0): + win_type=None, on=None, axis=0): axis = self._get_axis_number(axis) return rwindow.rolling(self, window=window, min_periods=min_periods, freq=freq, - center=center, win_type=win_type, axis=axis) + center=center, win_type=win_type, + on=on, axis=axis) cls.rolling = rolling diff --git a/pandas/core/window.py b/pandas/core/window.py index bc4d34529287b..9e2a27adc25a7 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -11,7 +11,11 @@ import numpy as np from collections import defaultdict -from pandas.types.generic import ABCSeries, ABCDataFrame +from pandas.types.generic import (ABCSeries, + ABCDataFrame, + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex) from pandas.types.common import (is_integer, is_bool, is_float_dtype, @@ -26,11 +30,14 @@ GroupByMixin) import pandas.core.common as com import pandas._window as _window +from pandas.tseries.offsets import DateOffset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import Substitution, Appender +from pandas.util.decorators import (Substitution, Appender, + cache_readonly) from textwrap import dedent + _shared_docs = dict() _doc_template = """ @@ -47,19 +54,21 @@ class _Window(PandasObject, SelectionMixin): _attributes = ['window', 'min_periods', 'freq', 'center', 'win_type', - 'axis'] + 'axis', 'on'] exclusions = set() def __init__(self, obj, window=None, min_periods=None, freq=None, - center=False, win_type=None, axis=0, **kwargs): + center=False, win_type=None, axis=0, on=None, **kwargs): if freq is not None: warnings.warn("The freq kw is deprecated and will be removed in a " "future version. You can resample prior to passing " "to a window function", FutureWarning, stacklevel=3) + self.__dict__.update(kwargs) self.blocks = [] self.obj = obj + self.on = on self.window = window self.min_periods = min_periods self.freq = freq @@ -72,6 +81,18 @@ def __init__(self, obj, window=None, min_periods=None, freq=None, def _constructor(self): return Window + @property + def is_datetimelike(self): + return None + + @property + def _on(self): + return None + + @property + def is_freq_type(self): + return self.win_type == 'freq' + def validate(self): if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") @@ -83,6 +104,7 @@ def _convert_freq(self, how=None): """ resample according to the how, return a new object """ obj = self._selected_obj + index = None if (self.freq is not None and isinstance(obj, (ABCSeries, ABCDataFrame))): if how is not None: @@ -92,13 +114,24 @@ def _convert_freq(self, how=None): stacklevel=6) obj = obj.resample(self.freq).aggregate(how or 'asfreq') - return obj + + return obj, index def _create_blocks(self, how): """ split data into blocks & return conformed data """ - obj = self._convert_freq(how) - return obj.as_blocks(copy=False).values(), obj + obj, index = self._convert_freq(how) + if index is not None: + index = self._on + + # filter out the on from the object + if self.on is not None: + if obj.ndim == 2: + obj = obj.reindex(columns=obj.columns.difference([self.on]), + copy=False) + blocks = obj.as_blocks(copy=False).values() + + return blocks, obj, index def _gotitem(self, key, ndim, subset=None): """ @@ -152,6 +185,21 @@ def __unicode__(self): return "{klass} [{attrs}]".format(klass=self._window_type, attrs=','.join(attrs)) + def _get_index(self, index=None): + """ + Return index as ndarrays + + Returns + ------- + tuple of (index, index_as_ndarray) + """ + + if self.is_freq_type: + if index is None: + index = self._on + return index, index.asi8 + return index, index + def _prep_values(self, values=None, kill_inf=True, how=None): if values is None: @@ -187,8 +235,8 @@ def _wrap_result(self, result, block=None, obj=None): if obj is None: obj = self._selected_obj - index = obj.index + if isinstance(result, np.ndarray): # coerce if necessary @@ -215,6 +263,9 @@ def _wrap_results(self, results, blocks, obj): obj : conformed data (may be resampled) """ + from pandas import Series + from pandas.core.index import _ensure_index + final = [] for result, block in zip(results, blocks): @@ -223,9 +274,31 @@ def _wrap_results(self, results, blocks, obj): return result final.append(result) + # if we have an 'on' column + # we want to put it back into the results + # in the same location + columns = self._selected_obj.columns + if self.on is not None \ + and not self._on.equals(obj.index): + + name = self._on.name + final.append(Series(self._on, index=obj.index, name=name)) + + if self._selection is not None: + + selection = _ensure_index(self._selection) + + # need to reorder to include original location of + # the on column (if its not already there) + if name not in selection: + columns = self.obj.columns + indexer = columns.get_indexer(selection.tolist() + [name]) + columns = columns.take(sorted(indexer)) + if not len(final): return obj.astype('float64') - return pd.concat(final, axis=1).reindex(columns=obj.columns) + return pd.concat(final, axis=1).reindex(columns=columns, + copy=False) def _center_window(self, result, window): """ center the result in the window """ @@ -271,18 +344,24 @@ def aggregate(self, arg, *args, **kwargs): class Window(_Window): """ - Provides rolling transformations. + Provides rolling window calculcations. .. versionadded:: 0.18.0 Parameters ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. + window : int, or offset + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. This is + new in 0.19.0 min_periods : int, default None Minimum number of observations in window required to have a value - (otherwise result is NA). + (otherwise result is NA). For a window that is specified by an offset, + this will default to 1. freq : string or DateOffset object, optional (default None) (DEPRECATED) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. @@ -290,11 +369,91 @@ class Window(_Window): Set the labels at the center of the window. win_type : string, default None Provide a window type. See the notes below. - axis : int, default 0 + on : string, optional + For a DataFrame, column on which to calculate + the rolling window, rather than the index + + .. versionadded:: 0.19.0 + + axis : int or string, default 0 Returns ------- - a Window sub-classed for the particular operation + a Window or Rolling sub-classed for the particular operation + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + Rolling sum with a window length of 2, using the 'triang' + window type. + + >>> df.rolling(2, win_type='triang').sum() + B + 0 NaN + 1 1.0 + 2 2.5 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, min_periods defaults + to the window length. + + >>> df.rolling(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Same as above, but explicity set the min_periods + + >>> df.rolling(2, min_periods=1).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + A ragged (meaning not-a-regular frequency), time-indexed DataFrame + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + ....: index = [pd.Timestamp('20130101 09:00:00'), + ....: pd.Timestamp('20130101 09:00:02'), + ....: pd.Timestamp('20130101 09:00:03'), + ....: pd.Timestamp('20130101 09:00:05'), + ....: pd.Timestamp('20130101 09:00:06')]) + + >>> df + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + + Contrasting to an integer rolling window, this will roll a variable + length window corresponding to the time period. + The default for min_periods is 1. + + >>> df.rolling('2s').sum() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 Notes ----- @@ -305,7 +464,10 @@ class Window(_Window): frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - The recognized window types are: + To learn more about the offsets & frequency strings, please see `this link + `__. + + The recognized win_types are: * ``boxcar`` * ``triang`` @@ -321,7 +483,8 @@ class Window(_Window): * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). -""" + + """ def validate(self): super(Window, self).validate() @@ -329,7 +492,7 @@ def validate(self): window = self.window if isinstance(window, (list, tuple, np.ndarray)): pass - elif com.is_integer(window): + elif is_integer(window): if window < 0: raise ValueError("window must be non-negative") try: @@ -400,7 +563,7 @@ def _apply_window(self, mean=True, how=None, **kwargs): window = self._prep_window(**kwargs) center = self.center - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: @@ -529,7 +692,8 @@ def _apply(self, func, name=None, window=None, center=None, if check_minp is None: check_minp = _use_window - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) + index, indexi = self._get_index(index=index) results = [] for b in blocks: try: @@ -551,9 +715,10 @@ def _apply(self, func, name=None, window=None, center=None, def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) - # GH #12373: rolling functions error on float32 data - return cfunc(_ensure_float64(arg), - window, minp, **kwargs) + # ensure we are only rolling on floats + arg = _ensure_float64(arg) + return cfunc(arg, + window, minp, indexi, **kwargs) # calculation function if center: @@ -587,11 +752,13 @@ class _Rolling_and_Expanding(_Rolling): observations inside provided window.""" def count(self): - obj = self._convert_freq() + + blocks, obj, index = self._create_blocks(how=None) + index, indexi = self._get_index(index=index) + window = self._get_window() window = min(window, len(obj)) if not self.center else window - blocks, obj = self._create_blocks(how=None) results = [] for b in blocks: @@ -625,10 +792,12 @@ def apply(self, func, args=(), kwargs={}): _level = kwargs.pop('_level', None) # noqa window = self._get_window() offset = _offset(window, self.center) + index, indexi = self._get_index() def f(arg, window, min_periods): minp = _use_window(min_periods, window) - return _window.roll_generic(arg, window, minp, offset, func, args, + return _window.roll_generic(arg, window, minp, indexi, + offset, func, args, kwargs) return self._apply(f, func, args=args, kwargs=kwargs, @@ -695,10 +864,12 @@ def median(self, how=None, **kwargs): def std(self, ddof=1, *args, **kwargs): nv.validate_window_func('std', args, kwargs) window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(_window.roll_var(arg, window, minp, ddof)) + return _zsqrt(_window.roll_var(arg, window, minp, indexi, + ddof)) return self._apply(f, 'std', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -740,10 +911,12 @@ def kurt(self, **kwargs): def quantile(self, quantile, **kwargs): window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return _window.roll_quantile(arg, window, minp, quantile) + return _window.roll_quantile(arg, window, minp, indexi, + quantile) return self._apply(f, 'quantile', quantile=quantile, **kwargs) @@ -823,43 +996,63 @@ def _get_corr(a, b): class Rolling(_Rolling_and_Expanding): - """ - Provides rolling window calculcations. - - .. versionadded:: 0.18.0 - Parameters - ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) (DEPRECATED) - Frequency to conform the data to before computing the statistic. - Specified as a frequency string or DateOffset object. - center : boolean, default False - Set the labels at the center of the window. - axis : int, default 0 + @cache_readonly + def is_datetimelike(self): + return isinstance(self._on, + (ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex)) + + @cache_readonly + def _on(self): + + if self.on is None: + return self.obj.index + elif (isinstance(self.obj, ABCDataFrame) and + self.on in self.obj.columns): + return pd.Index(self.obj[self.on]) + else: + raise ValueError("invalid on specified as {0}, " + "must be a column (if DataFrame) " + "or None".format(self.on)) - Returns - ------- - a Window sub-classed for the particular operation + def validate(self): + super(Rolling, self).validate() - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. + # we allow rolling on a datetimelike index + if (self.is_datetimelike and + isinstance(self.window, (compat.string_types, DateOffset))): - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - """ + # must be monotonic for on + if not self._on.is_monotonic: + formatted = self.on or 'index' + raise ValueError("{0} must be " + "monotonic".format(formatted)) - def validate(self): - super(Rolling, self).validate() - if not is_integer(self.window): + from pandas.tseries.frequencies import to_offset + try: + freq = to_offset(self.window) + except (TypeError, ValueError): + raise ValueError("passed window {0} in not " + "compat with a datetimelike " + "index".format(self.window)) + + # we don't allow center + if self.center: + raise NotImplementedError("center is not implemented " + "for datetimelike and offset " + "based windows") + + # this will raise ValueError on non-fixed freqs + self.window = freq.nanos + self.win_type = 'freq' + + # min_periods must be an integer + if self.min_periods is None: + self.min_periods = 1 + + elif not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: raise ValueError("window must be non-negative") @@ -876,6 +1069,11 @@ def aggregate(self, arg, *args, **kwargs): @Appender(_doc_template) @Appender(_shared_docs['count']) def count(self): + + # different impl for freq counting + if self.is_freq_type: + return self._apply('roll_count', 'count') + return super(Rolling, self).count() @Substitution(name='rolling') @@ -993,12 +1191,31 @@ class Expanding(_Rolling_and_Expanding): Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. - axis : int, default 0 + axis : int or string, default 0 Returns ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.expanding(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + Notes ----- By default, the result is set to the right edge of the window. This can be @@ -1205,6 +1422,25 @@ class EWM(_Rolling): ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + Notes ----- Exactly one of center of mass, span, half-life, and alpha must be provided. @@ -1248,6 +1484,7 @@ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, self.adjust = adjust self.ignore_na = ignore_na self.axis = axis + self.on = None @property def _constructor(self): @@ -1276,7 +1513,7 @@ def _apply(self, func, how=None, **kwargs): y : type of input argument """ - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 3693ebdb12e2f..7a35682eee3b0 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -11,7 +11,7 @@ import pandas as pd from pandas import (Series, DataFrame, Panel, bdate_range, isnull, - notnull, concat) + notnull, concat, Timestamp) import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.core.window as rwindow @@ -101,7 +101,7 @@ def tests_skip_nuisance(self): expected = pd.concat([r[['A', 'B']].sum(), df[['C']]], axis=1) result = r.sum() - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) @@ -319,6 +319,13 @@ class TestRolling(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.rolling(2).sum() + df.rolling(2, min_periods=1).sum() + def test_constructor(self): # GH 12669 @@ -372,6 +379,12 @@ class TestExpanding(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.expanding(2).sum() + def test_constructor(self): # GH 12669 @@ -408,6 +421,12 @@ class TestEWM(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.ewm(com=0.5).mean() + def test_constructor(self): for o in [self.series, self.frame]: c = o.ewm @@ -565,6 +584,7 @@ def _create_data(self): def test_dtypes(self): self._create_data() for f_name, d_name in product(self.funcs.keys(), self.data.keys()): + f = self.funcs[f_name] d = self.data[d_name] exp = self.expects[d_name][f_name] @@ -958,6 +978,7 @@ def test_rolling_median(self): name='median') def test_rolling_min(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_min, np.min, name='min') @@ -970,6 +991,7 @@ def test_rolling_min(self): window=3, min_periods=5) def test_rolling_max(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_max, np.max, name='max') @@ -2890,6 +2912,7 @@ def test_rolling_median_memory_error(self): Series(np.random.randn(n)).rolling(window=2, center=False).median() def test_rolling_min_max_numeric_types(self): + # GH12373 types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] types_test.extend([np.dtype("{}{}".format(sign, width)) @@ -2961,6 +2984,7 @@ def test_rolling(self): r = g.rolling(window=4) for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) @@ -3007,6 +3031,7 @@ def test_expanding(self): r = g.expanding() for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) tm.assert_frame_equal(result, expected) @@ -3047,3 +3072,547 @@ def test_expanding_apply(self): result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum())) tm.assert_frame_equal(result, expected) + + +class TestRollingTS(tm.TestCase): + + # rolling time-series friendly + # xref GH13327 + + def setUp(self): + + self.regular = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}).set_index('A') + + self.ragged = DataFrame({'B': range(5)}) + self.ragged.index = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=[Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')]) + df + df.rolling('2s').sum() + + def test_valid(self): + + df = self.regular + + # not a valid freq + with self.assertRaises(ValueError): + df.rolling(window='foobar') + + # not a datetimelike index + with self.assertRaises(ValueError): + df.reset_index().rolling(window='foobar') + + # non-fixed freqs + for freq in ['2MS', pd.offsets.MonthBegin(2)]: + with self.assertRaises(ValueError): + df.rolling(window=freq) + + for freq in ['1D', pd.offsets.Day(2), '2ms']: + df.rolling(window=freq) + + # non-integer min_periods + for minp in [1.0, 'foo', np.array([1, 2, 3])]: + with self.assertRaises(ValueError): + df.rolling(window='1D', min_periods=minp) + + # center is not implemented + with self.assertRaises(NotImplementedError): + df.rolling(window='1D', center=True) + + def test_on(self): + + df = self.regular + + # not a valid column + with self.assertRaises(ValueError): + df.rolling(window='2s', on='foobar') + + # column is valid + df = df.copy() + df['C'] = pd.date_range('20130101', periods=len(df)) + df.rolling(window='2d', on='C').sum() + + # invalid columns + with self.assertRaises(ValueError): + df.rolling(window='2d', on='B') + + # ok even though on non-selected + df.rolling(window='2d', on='C').B.sum() + + def test_monotonic_on(self): + + # on/index must be monotonic + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}) + + self.assertTrue(df.A.is_monotonic) + df.rolling('2s', on='A').sum() + + df = df.set_index('A') + self.assertTrue(df.index.is_monotonic) + df.rolling('2s').sum() + + # non-monotonic + df.index = reversed(df.index.tolist()) + self.assertFalse(df.index.is_monotonic) + + with self.assertRaises(ValueError): + df.rolling('2s').sum() + + df = df.reset_index() + with self.assertRaises(ValueError): + df.rolling('2s', on='A').sum() + + def test_frame_on(self): + + df = DataFrame({'B': range(5), + 'C': pd.date_range('20130101 09:00:00', + periods=5, + freq='3s')}) + + df['A'] = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + # we are doing simulating using 'on' + expected = (df.set_index('A') + .rolling('2s') + .B + .sum() + .reset_index(drop=True) + ) + + result = (df.rolling('2s', on='A') + .B + .sum() + ) + tm.assert_series_equal(result, expected) + + # test as a frame + # we should be ignoring the 'on' as an aggregation column + # note that the expected is setting, computing, and reseting + # so the columns need to be switched compared + # to the actual result where they are ordered as in the + # original + expected = (df.set_index('A') + .rolling('2s')[['B']] + .sum() + .reset_index()[['B', 'A']] + ) + + result = (df.rolling('2s', on='A')[['B']] + .sum() + ) + tm.assert_frame_equal(result, expected) + + def test_frame_on2(self): + + # using multiple aggregation columns + df = DataFrame({'A': [0, 1, 2, 3, 4], + 'B': [0, 1, 2, np.nan, 4], + 'C': pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')])}, + columns=['A', 'C', 'B']) + + expected1 = DataFrame({'A': [0., 1, 3, 3, 7], + 'B': [0, 1, 3, np.nan, 4], + 'C': df['C']}, + columns=['A', 'C', 'B']) + + result = df.rolling('2s', on='C').sum() + expected = expected1 + tm.assert_frame_equal(result, expected) + + expected = Series([0, 1, 3, np.nan, 4], name='B') + result = df.rolling('2s', on='C').B.sum() + tm.assert_series_equal(result, expected) + + expected = expected1[['A', 'B', 'C']] + result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum() + tm.assert_frame_equal(result, expected) + + def test_basic_regular(self): + + df = self.regular.copy() + + df.index = pd.date_range('20130101', periods=5, freq='D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='1D').sum() + tm.assert_frame_equal(result, expected) + + df.index = pd.date_range('20130101', periods=5, freq='2D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1).sum() + result = df.rolling(window='2D').sum() + tm.assert_frame_equal(result, expected) + + def test_min_periods(self): + + # compare for min_periods + df = self.regular + + # these slightly different + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s').sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + def test_ragged_sum(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 3, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=2).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, np.nan, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s').sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=3).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 10] + tm.assert_frame_equal(result, expected) + + def test_ragged_mean(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_median(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_quantile(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 1.0, 3.0, 3.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_std(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.5] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + tm.assert_frame_equal(result, expected) + + def test_ragged_var(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.25] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.] + tm.assert_frame_equal(result, expected) + + def test_ragged_skew(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_kurt(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 4 + [-1.2] + tm.assert_frame_equal(result, expected) + + def test_ragged_count(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 1, 1, 1] + tm.assert_frame_equal(result, expected) + + df = self.ragged + result = df.rolling(window='1s').count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 2, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=2).count() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 2, np.nan, 2] + tm.assert_frame_equal(result, expected) + + def test_regular_min(self): + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [0.0, 1, 2, 3, 4]}).set_index('A') + result = df.rolling('1s').min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [5, 4, 3, 4, 5]}).set_index('A') + + tm.assert_frame_equal(result, expected) + result = df.rolling('2s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling('5s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 3] + tm.assert_frame_equal(result, expected) + + def test_ragged_min(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_perf_min(self): + + N = 10000 + + dfp = DataFrame({'B': np.random.randn(N)}, + index=pd.date_range('20130101', + periods=N, + freq='s')) + expected = dfp.rolling(2, min_periods=1).min() + result = dfp.rolling('2s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + expected = dfp.rolling(200, min_periods=1).min() + result = dfp.rolling('200s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + def test_ragged_max(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_ragged_apply(self): + + df = self.ragged + + f = lambda x: 1 + result = df.rolling(window='1s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + def test_all(self): + + # simple comparision of integer vs time-based windowing + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window='1s') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + expected = getattr(er, f)() + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = er.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = r.apply(lambda x: 1) + expected = er.apply(lambda x: 1) + tm.assert_frame_equal(result, expected) + + def test_all2(self): + + # more sophisticated comparision of integer vs. + # time-based windowing + df = DataFrame({'B': np.arange(50)}, + index=pd.date_range('20130101', + periods=50, freq='H') + ) + # in-range data + dft = df.between_time("09:00", "16:00") + + r = dft.rolling(window='5H') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + + # we need to roll the days separately + # to compare with a time-based roll + # finally groupby-apply will return a multi-index + # so we need to drop the day + def agg_by_day(x): + x = x.between_time("09:00", "16:00") + return getattr(x.rolling(5, min_periods=1), f)() + expected = df.groupby(df.index.day).apply( + agg_by_day).reset_index(level=0, drop=True) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/window.pyx b/pandas/window.pyx index bfe9152477a40..8235d68e2a88b 100644 --- a/pandas/window.pyx +++ b/pandas/window.pyx @@ -1,3 +1,6 @@ +# cython: profile=False +# cython: boundscheck=False, wraparound=False, cdivision=True + from numpy cimport * cimport numpy as np import numpy as np @@ -51,9 +54,10 @@ cdef double nan = NaN cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -# this is our util.pxd from util cimport numeric +from skiplist cimport * + cdef extern from "src/headers/math.h": double sqrt(double x) nogil int signbit(double) nogil @@ -69,16 +73,37 @@ include "skiplist.pyx" # - In Cython x * x is faster than x ** 2 for C types, this should be # periodically revisited to see if it's still true. # -# - -def _check_minp(win, minp, N, floor=1): + +def _check_minp(win, minp, N, floor=None): + """ + Parameters + ---------- + win: int + minp: int or None + N: len of window + floor: int, optional + default 1 + + Returns + ------- + minimum period + """ + + if minp is None: + minp = 1 + if not util.is_integer_object(minp): + raise ValueError("min_periods must be an integer") if minp > win: - raise ValueError('min_periods (%d) must be <= window (%d)' - % (minp, win)) + raise ValueError("min_periods (%d) must be <= " + "window (%d)" % (minp, win)) elif minp > N: minp = N + 1 elif minp < 0: raise ValueError('min_periods must be >= 0') + if floor is None: + floor = 1 + return max(minp, floor) # original C implementation by N. Devillard. @@ -96,757 +121,1227 @@ def _check_minp(win, minp, N, floor=1): # Physical description: 366 p. # Series: Prentice-Hall Series in Automatic Computation -#------------------------------------------------------------------------------- -# Rolling sum -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_sum(ndarray[double_t] input, int win, int minp): - cdef double val, prev, sum_x = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) +# ---------------------------------------------------------------------- +# The indexer objects for rolling +# These define start/end indexers to compute offsets - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - # Not NaN - if val == val: - nobs += 1 - sum_x += val +cdef class WindowIndexer: - output[i] = NaN + cdef: + ndarray start, end + int64_t N, minp, win + bint is_variable - for i from minp - 1 <= i < N: - val = input[i] + def get_data(self): + return (self.start, self.end, self.N, + self.win, self.minp, + self.is_variable) - if val == val: - nobs += 1 - sum_x += val - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 +cdef class MockFixedWindowIndexer(WindowIndexer): + """ - if nobs >= minp: - output[i] = sum_x - else: - output[i] = NaN + We are just checking parameters of the indexer, + and returning a consistent API with fixed/variable + indexers. - return output + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring -#------------------------------------------------------------------------------- -# Rolling mean -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_mean(ndarray[double_t] input, - int win, int minp): - cdef: - double val, prev, result, sum_x = 0 - Py_ssize_t nobs = 0, i, neg_ct = 0 - Py_ssize_t N = len(input) + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): - cdef ndarray[double_t] output = np.empty(N, dtype=float) - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) + self.start = np.empty(0, dtype='int64') + self.end = np.empty(0, dtype='int64') + self.win = win - # Not NaN - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 - output[i] = NaN +cdef class FixedWindowIndexer(WindowIndexer): + """ + create a fixed length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments - for i from minp - 1 <= i < N: - val = input[i] + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring the unit - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): + cdef ndarray start_s, start_e, end_s, end_e - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - if signbit(prev): - neg_ct -= 1 + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) - if nobs >= minp: - result = sum_x / nobs - if neg_ct == 0 and result < 0: - # all positive - output[i] = 0 - elif neg_ct == nobs and result > 0: - # all negative - output[i] = 0 - else: - output[i] = result - else: - output[i] = NaN + start_s = np.zeros(win, dtype='int64') + start_e = np.arange(win, self.N, dtype='int64') - win + 1 + self.start = np.concatenate([start_s, start_e]) - return output + end_s = np.arange(win, dtype='int64') + 1 + end_e = start_e + win + self.end = np.concatenate([end_s, end_e]) + self.win = win -#------------------------------------------------------------------------------- -# Exponentially weighted moving average -def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, int minp): +cdef class VariableWindowIndexer(WindowIndexer): """ - Compute exponentially-weighted moving average using center-of-mass. + create a variable length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments Parameters ---------- - input : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: ndarray + index of the input - Returns - ------- - y : ndarray """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + ndarray index): - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output + self.is_variable = 1 + self.N = len(index) + self.minp = _check_minp(win, minp, self.N) - minp = max(minp, 1) + self.start = np.empty(self.N, dtype='int64') + self.start.fill(-1) - cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur - cdef Py_ssize_t i, nobs + self.end = np.empty(self.N, dtype='int64') + self.end.fill(-1) - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha + self.build(index, win) - weighted_avg = input[0] - is_observation = (weighted_avg == weighted_avg) - nobs = int(is_observation) - output[0] = weighted_avg if (nobs >= minp) else NaN - old_wt = 1. + # max window size + self.win = (self.end - self.start).max() - for i from 1 <= i < N: - cur = input[i] - is_observation = (cur == cur) - nobs += int(is_observation) - if weighted_avg == weighted_avg: - if is_observation or (not ignore_na): - old_wt *= old_wt_factor - if is_observation: - if weighted_avg != cur: # avoid numerical errors on constant series - weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. - elif is_observation: - weighted_avg = cur + def build(self, ndarray[int64_t] index, int64_t win): - output[i] = weighted_avg if (nobs >= minp) else NaN + cdef: + ndarray[int64_t] start, end + int64_t start_bound, end_bound, N + Py_ssize_t i, j - return output + start = self.start + end = self.end + N = self.N -#------------------------------------------------------------------------------- -# Exponentially weighted moving covariance + start[0] = 0 + end[0] = 1 -def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, - double_t com, int adjust, int ignore_na, int minp, int bias): + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, N): + end_bound = index[i] + start_bound = index[i] - win + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + +def get_window_indexer(input, win, minp, index, floor=None, + use_mock=True): """ - Compute exponentially-weighted moving variance using center-of-mass. + return the correct window indexer for the computation Parameters ---------- - input_x : ndarray (float64 type) - input_y : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int - bias: int + input: 1d ndarray + win: integer, window size + minp: integer, minimum periods + index: 1d ndarray, optional + index to the input array + floor: optional + unit for flooring the unit + use_mock: boolean, default True + if we are a fixed indexer, return a mock indexer + instead of the FixedWindow Indexer. This is a type + compat Indexer that allows us to use a standard + code path with all of the indexers. Returns ------- - y : ndarray - """ + tuple of 1d int64 ndarrays of the offsets & data about the window - cdef Py_ssize_t N = len(input_x) - if len(input_y) != N: - raise ValueError('arrays are of different lengths (%d and %d)' % (N, len(input_y))) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output - - minp = max(minp, 1) + """ - cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov - cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y - cdef Py_ssize_t i, nobs + if index is not None: + indexer = VariableWindowIndexer(input, win, minp, index) + elif use_mock: + indexer = MockFixedWindowIndexer(input, win, minp, index, floor) + else: + indexer = FixedWindowIndexer(input, win, minp, index, floor) + return indexer.get_data() - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha +# ---------------------------------------------------------------------- +# Rolling count +# this is only an impl for index not None, IOW, freq aware - mean_x = input_x[0] - mean_y = input_y[0] - is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) - nobs = int(is_observation) - if not is_observation: - mean_x = NaN - mean_y = NaN - output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN - cov = 0. - sum_wt = 1. - sum_wt2 = 1. - old_wt = 1. - for i from 1 <= i < N: - cur_x = input_x[i] - cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) - nobs += int(is_observation) - if mean_x == mean_x: - if is_observation or (not ignore_na): - sum_wt *= old_wt_factor - sum_wt2 *= (old_wt_factor * old_wt_factor) - old_wt *= old_wt_factor - if is_observation: - old_mean_x = mean_x - old_mean_y = mean_y - if mean_x != cur_x: # avoid numerical errors on constant series - mean_x = ((old_wt * old_mean_x) + (new_wt * cur_x)) / (old_wt + new_wt) - if mean_y != cur_y: # avoid numerical errors on constant series - mean_y = ((old_wt * old_mean_y) + (new_wt * cur_y)) / (old_wt + new_wt) - cov = ((old_wt * (cov + ((old_mean_x - mean_x) * (old_mean_y - mean_y)))) + - (new_wt * ((cur_x - mean_x) * (cur_y - mean_y)))) / (old_wt + new_wt) - sum_wt += new_wt - sum_wt2 += (new_wt * new_wt) - old_wt += new_wt - if not adjust: - sum_wt /= old_wt - sum_wt2 /= (old_wt * old_wt) - old_wt = 1. - elif is_observation: - mean_x = cur_x - mean_y = cur_y +def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, count_x = 0.0 + int64_t s, e, nobs, N + Py_ssize_t i, j + ndarray[int64_t] start, end + ndarray[double_t] output - if nobs >= minp: - if not bias: - numerator = sum_wt * sum_wt - denominator = numerator - sum_wt2 - output[i] = ((numerator / denominator) * cov) if (denominator > 0.) else NaN - else: - output[i] = cov - else: - output[i] = NaN + start, end, N, win, minp, _ = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) - return output + with nogil: -#---------------------------------------------------------------------- -# Rolling variance + for i in range(0, N): + s = start[i] + e = end[i] -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): - """ - Numerically stable implementation using Welford's method. - """ - cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta - cdef Py_ssize_t i - cdef Py_ssize_t N = len(input) + if i == 0: - cdef ndarray[double_t] output = np.empty(N, dtype=float) + # setup + count_x = 0.0 + for j in range(s, e): + val = input[j] + if val == val: + count_x += 1.0 - minp = _check_minp(win, minp, N) + else: - # Check for windows larger than array, addresses #7297 - win = min(win, N) + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + count_x -= 1.0 - with nogil: - # Over the first window, observations can only be added, never removed - for i from 0 <= i < win: - val = input[i] + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + count_x += 1.0 - # Not NaN - if val == val: - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 + if count_x >= minp: + output[i] = count_x else: - val = NaN + output[i] = NaN - output[i] = val + return output - # After the first window, observations can both be added and removed - for i from win <= i < N: - val = input[i] - prev = input[i - win] +# ---------------------------------------------------------------------- +# Rolling sum - if val == val: - if prev == prev: - # Adding one observation and removing another one - delta = val - prev - prev -= mean_x - mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta - else: - # Adding one observation and not removing any - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - elif prev == prev: - # Adding no new observation, but removing one - nobs -= 1 - if nobs: - delta = (prev - mean_x) - mean_x -= delta / nobs - ssqdm_x -= delta * (prev - mean_x) - else: - mean_x = 0 - ssqdm_x = 0 - # Variance is unchanged if no observation is added or removed - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - output[i] = val +cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil: + cdef double result - return output + if nobs >= minp: + result = sum_x + else: + result = NaN + return result -#------------------------------------------------------------------------------- -# Rolling skewness -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_skew(ndarray[double_t] input, int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) +cdef inline void add_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ add a value from the sum calc """ - # 3 components of the skewness equation - cdef double A, B, C, R + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - # Not NaN - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val +cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ remove a value from the sum calc """ - output[i] = NaN + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val - for i from minp - 1 <= i < N: - val = input[i] - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val +def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, sum_x = 0 + int64_t s, e + int64_t nobs = 0, i, j, N + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) - nobs -= 1 - if nobs >= minp: - A = x / nobs - B = xx / nobs - A * A - C = xxx / nobs - A * A * A - 3 * A * B - if B <= 0 or nobs < 3: - output[i] = NaN - else: - R = sqrt(B) - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) - else: - output[i] = NaN + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster - return output + if is_variable: -#------------------------------------------------------------------------------- -# Rolling kurtosis -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_kurt(ndarray[double_t] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) + # variable window + with nogil: - cdef ndarray[double_t] output = np.empty(N, dtype=float) + for i in range(0, N): + s = start[i] + e = end[i] - # 5 components of the kurtosis equation - cdef double A, B, C, D, R, K + if i == 0: - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + add_sum(input[j], &nobs, &sum_x) - # Not NaN - if val == val: - nobs += 1 + else: - # seriously don't ask me why this is faster - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val + # calculate deletes + for j in range(start[i - 1], s): + remove_sum(input[j], &nobs, &sum_x) - output[i] = NaN + # calculate adds + for j in range(end[i - 1], e): + add_sum(input[j], &nobs, &sum_x) - for i from minp - 1 <= i < N: - val = input[i] + output[i] = calc_sum(minp, nobs, sum_x) - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val + else: - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - xxxx -= prev * prev * prev * prev + # fixed window - nobs -= 1 + with nogil: - if nobs >= minp: - A = x / nobs - R = A * A - B = xx / nobs - R - R = R * A - C = xxx / nobs - R - 3 * A * B - R = R * A - D = xxxx / nobs - R - 6*B*A*A - 4*C*A - - if B == 0 or nobs < 4: - output[i] = NaN + for i in range(0, minp - 1): + add_sum(input[i], &nobs, &sum_x) + output[i] = NaN - else: - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) + for i in range(minp - 1, N): + val = input[i] + add_sum(val, &nobs, &sum_x) - output[i] = K + if i > win - 1: + prev_x = input[i - win] + remove_sum(prev_x, &nobs, &sum_x) - else: - output[i] = NaN + output[i] = calc_sum(minp, nobs, sum_x) return output -#------------------------------------------------------------------------------- -# Rolling median, min, max +# ---------------------------------------------------------------------- +# Rolling mean -from skiplist cimport * -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_median_c(ndarray[float64_t] arg, int win, int minp): - cdef: - double val, res, prev - bint err=0 - int ret=0 - skiplist_t *sl - Py_ssize_t midpoint, nobs = 0, i +cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs, + Py_ssize_t neg_ct, double sum_x) nogil: + cdef double result + if nobs >= minp: + result = sum_x / nobs + if neg_ct == 0 and result < 0: + # all positive + result = 0 + elif neg_ct == nobs and result > 0: + # all negative + result = 0 + else: + pass + else: + result = NaN + return result - cdef Py_ssize_t N = len(arg) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - sl = skiplist_init(win) - if sl == NULL: - raise MemoryError("skiplist_init failed") +cdef inline void add_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ add a value from the mean calc """ - minp = _check_minp(win, minp, N) + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val + if signbit(val): + neg_ct[0] = neg_ct[0] + 1 - with nogil: - for i from 0 <= i < minp - 1: - val = arg[i] - # Not NaN - if val == val: - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - output[i] = NaN +cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ remove a value from the mean calc """ + + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val + if signbit(val): + neg_ct[0] = neg_ct[0] - 1 - with nogil: - if not err: - for i from minp - 1 <= i < N: - val = arg[i] +def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, result, sum_x = 0 + int64_t s, e + bint is_variable + Py_ssize_t nobs = 0, i, j, neg_ct = 0, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) if i > win - 1: - prev = arg[i - win] + prev_x = input[i - win] + remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + return output + +# ---------------------------------------------------------------------- +# Rolling variance + + +cdef inline double calc_var(int64_t minp, int ddof, double nobs, + double ssqdm_x) nogil: + cdef double result + + # Variance is unchanged if no observation is added or removed + if (nobs >= minp) and (nobs > ddof): + + # pathological case + if nobs == 1: + result = 0 + else: + result = ssqdm_x / (nobs - ddof) + if result < 0: + result = 0 + else: + result = NaN + + return result + + +cdef inline void add_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ add a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] + delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + +cdef inline void remove_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ remove a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + if nobs[0]: + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] - delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + else: + mean_x[0] = 0 + ssqdm_x[0] = 0 + + +def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, + object index, int ddof=1): + """ + Numerically stable implementation using Welford's method. + """ + cdef: + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + int64_t s, e + bint is_variable + Py_ssize_t i, j, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # Check for windows larger than array, addresses #7297 + win = min(win, N) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we + # have 2 paths but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + # calculate deletes + for j in range(start[i - 1], s): + remove_var(input[j], &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + else: + + with nogil: + + # Over the first window, observations can only be added, never + # removed + for i from 0 <= i < win: + add_var(input[i], &nobs, &mean_x, &ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + # After the first window, observations can both be added and + # removed + for i from win <= i < N: + val = input[i] + prev = input[i - win] + + if val == val: if prev == prev: - skiplist_remove(sl, prev) - nobs -= 1 + # Adding one observation and removing another one + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + + else: + add_var(val, &nobs, &mean_x, &ssqdm_x) + elif prev == prev: + remove_var(prev, &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + return output + + +# ---------------------------------------------------------------------- +# Rolling skewness + +cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, + double xxx) nogil: + cdef double result, dnobs + cdef double A, B, C, R + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + B = xx / dnobs - A * A + C = xxx / dnobs - A * A * A - 3 * A * B + if B <= 0 or nobs < 3: + result = NaN + else: + R = sqrt(B) + result = ((sqrt(dnobs * (dnobs - 1.)) * C) / + ((dnobs - 2) * R * R * R)) + else: + result = NaN + + return result + +cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ add a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + +cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ remove a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + + +def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_skew(val, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + + if i > win - 1: + prev = input[i - win] + remove_skew(prev, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling kurtosis + + +cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, + double xxx, double xxxx) nogil: + cdef double result, dnobs + cdef double A, B, C, D, R, K + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + R = A * A + B = xx / dnobs - R + R = R * A + C = xxx / dnobs - R - 3 * A * B + R = R * A + D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A + + if B == 0 or nobs < 4: + result = NaN + else: + K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) + result = K / ((dnobs - 2.) * (dnobs - 3.)) + else: + result = NaN + + return result + +cdef inline void add_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ add a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + xxxx[0] = xxxx[0] + val * val * val * val + +cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ remove a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + xxxx[0] = xxxx[0] - val * val * val * val + + +def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0, xxxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + else: + + with nogil: + + for i from 0 <= i < minp - 1: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + + if i > win - 1: + prev = input[i - win] + remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling median, min, max + + +def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, res, prev + bint err=0, is_variable + int ret=0 + skiplist_t *sl + Py_ssize_t i, j + int64_t nobs = 0, N, s, e + int midpoint + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + use_mock=False) + output = np.empty(N, dtype=float) + + sl = skiplist_init(win) + if sl == NULL: + raise MemoryError("skiplist_init failed") + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + val = input[i] if val == val: nobs += 1 err = skiplist_insert(sl, val) != 1 if err: break - if nobs >= minp: - midpoint = nobs / 2 - if nobs % 2: - res = skiplist_get(sl, midpoint, &ret) - else: - res = (skiplist_get(sl, midpoint, &ret) + - skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + skiplist_remove(sl, val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + if nobs >= minp: + midpoint = (nobs / 2) + if nobs % 2: + res = skiplist_get(sl, midpoint, &ret) else: - res = NaN + res = (skiplist_get(sl, midpoint, &ret) + + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + res = NaN - output[i] = res + output[i] = res - skiplist_destroy(sl) + skiplist_destroy(sl) if err: raise MemoryError("skiplist_insert failed") return output -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Moving maximum / minimum code taken from Bottleneck under the terms # of its Simplified BSD license # https://github.com/kwgoodman/bottleneck -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_max(ndarray[numeric] a, int window, int minp): + +cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: + + if numeric in cython.floating: + if ai == ai: + nobs[0] = nobs[0] + 1 + elif is_max: + if numeric == cython.float: + ai = MINfloat32 + else: + ai = MINfloat64 + else: + if numeric == cython.float: + ai = MAXfloat32 + else: + ai = MAXfloat64 + + else: + nobs[0] = nobs[0] + 1 + + return ai + + +cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: + """ remove a value from the mm calc """ + if numeric in cython.floating and aold == aold: + nobs[0] = nobs[0] - 1 + + +cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, + numeric value) nogil: + cdef numeric result + + if numeric in cython.floating: + if nobs >= minp: + result = value + else: + result = NaN + else: + result = value + + return result + + +def roll_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - a: numpy array + input: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN + index: ndarray, optional + index for window computation """ - return _roll_min_max(a, window, minp, 1) + return _roll_min_max(input, win, minp, index, is_max=1) + -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_min(ndarray[numeric] a, int window, int minp): +def roll_min(ndarray[numeric] input, int64_t win, int64_t minp, + object index): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - a: numpy array + input: numpy array window: int, size of rolling window minp: if number of observations in window is below this, output a NaN + index: ndarray, optional + index for window computation """ - return _roll_min_max(a, window, minp, 0) - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef _roll_min_max(ndarray[numeric] a, int window, int minp, bint is_max): - "Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs." - cdef numeric ai, aold - cdef Py_ssize_t count - cdef Py_ssize_t* death - cdef numeric* ring - cdef numeric* minvalue - cdef numeric* end - cdef numeric* last - cdef Py_ssize_t i0 - cdef np.npy_intp *dim - dim = PyArray_DIMS(a) - cdef Py_ssize_t n0 = dim[0] - cdef np.npy_intp *dims = [n0] - cdef bint should_replace - cdef np.ndarray[numeric, ndim=1] y = PyArray_EMPTY(1, dims, PyArray_TYPE(a), 0) - - if window < 1: - raise ValueError('Invalid window size %d' - % (window)) - - if minp > window: - raise ValueError('Invalid min_periods size %d greater than window %d' - % (minp, window)) - - minp = _check_minp(window, minp, n0) - with nogil: - ring = malloc(window * sizeof(numeric)) - death = malloc(window * sizeof(Py_ssize_t)) - end = ring + window - last = ring + return _roll_min_max(input, win, minp, index, is_max=0) + +cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index, bint is_max): + """ + Moving min/max of 1d array of any numeric type along axis=0 + ignoring NaNs. + """ + + cdef: + numeric ai + bint is_variable, should_replace + int64_t s, e, N, i, j, removed + Py_ssize_t nobs = 0 + ndarray[int64_t] starti, endi + ndarray[numeric, ndim=1] output + cdef: + int64_t* death + numeric* ring + numeric* minvalue + numeric* end + numeric* last + + cdef: + cdef numeric r + + starti, endi, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index) + + output = np.empty(N, dtype=input.dtype) + + if is_variable: + + with nogil: + + for i in range(N): + s = starti[i] + e = endi[i] + + r = input[s] + nobs = 0 + for j in range(s, e): + + # adds, death at the i offset + ai = init_mm(input[j], &nobs, is_max) + + if is_max: + if ai > r: + r = ai + else: + if ai < r: + r = ai + + output[i] = calc_mm(minp, nobs, r) + + else: + + # setup the rings of death! + ring = malloc(win * sizeof(numeric)) + death = malloc(win * sizeof(int64_t)) + + end = ring + win + last = ring minvalue = ring - ai = a[0] - if numeric in cython.floating: - if ai == ai: - minvalue[0] = ai - elif is_max: - minvalue[0] = MINfloat64 - else: - minvalue[0] = MAXfloat64 - else: - minvalue[0] = ai - death[0] = window - - count = 0 - for i0 in range(n0): - ai = a[i0] - if numeric in cython.floating: - if ai == ai: - count += 1 - elif is_max: - ai = MINfloat64 + ai = input[0] + minvalue[0] = init_mm(input[0], &nobs, is_max) + death[0] = win + nobs = 0 + + with nogil: + + for i in range(N): + ai = init_mm(input[i], &nobs, is_max) + + if i >= win: + remove_mm(input[i - win], &nobs) + + if death[minvalue - ring] == i: + minvalue = minvalue + 1 + if minvalue >= end: + minvalue = ring + + if is_max: + should_replace = ai >= minvalue[0] else: - ai = MAXfloat64 - else: - count += 1 - if i0 >= window: - aold = a[i0 - window] - if aold == aold: - count -= 1 - if death[minvalue-ring] == i0: - minvalue += 1 - if minvalue >= end: - minvalue = ring - should_replace = ai >= minvalue[0] if is_max else ai <= minvalue[0] - if should_replace: - minvalue[0] = ai - death[minvalue-ring] = i0 + window - last = minvalue - else: - should_replace = last[0] <= ai if is_max else last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - should_replace = last[0] <= ai if is_max else last[0] >= ai - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i0 + window - if numeric in cython.floating: - if count >= minp: - y[i0] = minvalue[0] + should_replace = ai <= minvalue[0] + if should_replace: + + minvalue[0] = ai + death[minvalue - ring] = i + win + last = minvalue + else: - y[i0] = NaN - else: - y[i0] = minvalue[0] - for i0 in range(minp - 1): - if numeric in cython.floating: - y[i0] = NaN - else: - y[i0] = 0 + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + while should_replace: + if last == ring: + last = end + last -= 1 + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + + last += 1 + if last == end: + last = ring + last[0] = ai + death[last - ring] = i + win + + output[i] = calc_mm(minp, nobs, minvalue[0]) + + for i in range(minp - 1): + if numeric in cython.floating: + output[i] = NaN + else: + output[i] = 0 + + free(ring) + free(death) + + # print("output: {0}".format(output)) + return output - free(ring) - free(death) - return y -def roll_quantile(ndarray[float64_t, cast=True] input, int win, - int minp, double quantile): +def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, + int64_t minp, object index, double quantile): """ O(N log(window)) implementation using skip list """ - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - + cdef: + double val, prev, midpoint + IndexableSkiplist skiplist + int64_t nobs = 0, i, j, s, e, N + Py_ssize_t idx + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + use_mock=False) + output = np.empty(N, dtype=float) skiplist = IndexableSkiplist(win) - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] + for i in range(0, N): + s = start[i] + e = end[i] - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) + if i == 0: - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] + # setup + val = input[i] + if val == val: + nobs += 1 + skiplist.insert(val) - if i > win - 1: - prev = input[i - win] + else: - if prev == prev: - skiplist.remove(prev) - nobs -= 1 + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + skiplist.remove(val) + nobs -= 1 - if val == val: - nobs += 1 - skiplist.insert(val) + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + nobs += 1 + skiplist.insert(val) if nobs >= minp: - idx = int((quantile / 1.) * (nobs - 1)) + idx = int(quantile * (nobs - 1)) output[i] = skiplist.get(idx) else: output[i] = NaN return output + def roll_generic(ndarray[float64_t, cast=True] input, - int win, int minp, int offset, - object func, object args, object kwargs): - cdef ndarray[double_t] output, counts, bufarr - cdef Py_ssize_t i, n - cdef float64_t *buf - cdef float64_t *oldbuf + int64_t win, int64_t minp, object index, + int offset, object func, + object args, object kwargs): + cdef: + ndarray[double_t] output, counts, bufarr + float64_t *buf + float64_t *oldbuf + int64_t nobs = 0, i, j, s, e, N + bint is_variable + ndarray[int64_t] start, end if not input.flags.c_contiguous: input = input.copy('C') @@ -855,36 +1350,60 @@ def roll_generic(ndarray[float64_t, cast=True] input, if n == 0: return input - minp = _check_minp(win, minp, n, floor=0) - output = np.empty(n, dtype=float) - counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:] + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index, + floor=0) + output = np.empty(N, dtype=float) - # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, n) - offset): - if counts[i] >= minp: - output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) - else: - output[i] = NaN + counts = roll_sum(np.concatenate([np.isfinite(input).astype(float), + np.array([0.] * offset)]), + win, minp, index)[offset:] - # remaining full-length windows - buf = input.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data - for i from (win - offset) <= i < (n - offset): - buf = buf + 1 - bufarr.data = buf - if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) - else: - output[i] = NaN - bufarr.data = oldbuf + if is_variable: - # truncated windows at the end - for i from int_max(n - offset, 0) <= i < n: - if counts[i] >= minp: - output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) - else: - output[i] = NaN + # variable window + if offset != 0: + raise ValueError("unable to roll_generic with a non-zero offset") + + for i in range(0, N): + s = start[i] + e = end[i] + + if counts[i] >= minp: + output[i] = func(input[s:e], *args, **kwargs) + else: + output[i] = NaN + + else: + + # truncated windows at the beginning, through first full-length window + for i from 0 <= i < (int_min(win, N) - offset): + if counts[i] >= minp: + output[i] = func(input[0: (i + offset + 1)], *args, **kwargs) + else: + output[i] = NaN + + # remaining full-length windows + buf = input.data + bufarr = np.empty(win, dtype=float) + oldbuf = bufarr.data + for i from (win - offset) <= i < (N - offset): + buf = buf + 1 + bufarr.data = buf + if counts[i] >= minp: + output[i] = func(bufarr, *args, **kwargs) + else: + output[i] = NaN + bufarr.data = oldbuf + + # truncated windows at the end + for i from int_max(N - offset, 0) <= i < N: + if counts[i] >= minp: + output[i] = func(input[int_max(i + offset - win + 1, 0): N], + *args, + **kwargs) + else: + output[i] = NaN return output @@ -952,3 +1471,179 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, output[in_i] = NaN return output + +# ---------------------------------------------------------------------- +# Exponentially weighted moving average + + +def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, + int minp): + """ + Compute exponentially-weighted moving average using center-of-mass. + + Parameters + ---------- + input : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + weighted_avg = input[0] + is_observation = (weighted_avg == weighted_avg) + nobs = int(is_observation) + output[0] = weighted_avg if (nobs >= minp) else NaN + old_wt = 1. + + for i from 1 <= i < N: + cur = input[i] + is_observation = (cur == cur) + nobs += int(is_observation) + if weighted_avg == weighted_avg: + + if is_observation or (not ignore_na): + + old_wt *= old_wt_factor + if is_observation: + + # avoid numerical errors on constant series + if weighted_avg != cur: + weighted_avg = ((old_wt * weighted_avg) + + (new_wt * cur)) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. + elif is_observation: + weighted_avg = cur + + output[i] = weighted_avg if (nobs >= minp) else NaN + + return output + +# ---------------------------------------------------------------------- +# Exponentially weighted moving covariance + + +def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, + double_t com, int adjust, int ignore_na, int minp, int bias): + """ + Compute exponentially-weighted moving variance using center-of-mass. + + Parameters + ---------- + input_x : ndarray (float64 type) + input_y : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + bias: int + + Returns + ------- + y : ndarray + """ + + cdef Py_ssize_t N = len(input_x) + if len(input_y) != N: + raise ValueError("arrays are of different lengths " + "(%d and %d)" % (N, len(input_y))) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov + cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + mean_x = input_x[0] + mean_y = input_y[0] + is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) + nobs = int(is_observation) + if not is_observation: + mean_x = NaN + mean_y = NaN + output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN + cov = 0. + sum_wt = 1. + sum_wt2 = 1. + old_wt = 1. + + for i from 1 <= i < N: + cur_x = input_x[i] + cur_y = input_y[i] + is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + nobs += int(is_observation) + if mean_x == mean_x: + if is_observation or (not ignore_na): + sum_wt *= old_wt_factor + sum_wt2 *= (old_wt_factor * old_wt_factor) + old_wt *= old_wt_factor + if is_observation: + old_mean_x = mean_x + old_mean_y = mean_y + + # avoid numerical errors on constant series + if mean_x != cur_x: + mean_x = ((old_wt * old_mean_x) + + (new_wt * cur_x)) / (old_wt + new_wt) + + # avoid numerical errors on constant series + if mean_y != cur_y: + mean_y = ((old_wt * old_mean_y) + + (new_wt * cur_y)) / (old_wt + new_wt) + cov = ((old_wt * (cov + ((old_mean_x - mean_x) * + (old_mean_y - mean_y)))) + + (new_wt * ((cur_x - mean_x) * + (cur_y - mean_y)))) / (old_wt + new_wt) + sum_wt += new_wt + sum_wt2 += (new_wt * new_wt) + old_wt += new_wt + if not adjust: + sum_wt /= old_wt + sum_wt2 /= (old_wt * old_wt) + old_wt = 1. + elif is_observation: + mean_x = cur_x + mean_y = cur_y + + if nobs >= minp: + if not bias: + numerator = sum_wt * sum_wt + denominator = numerator - sum_wt2 + if (denominator > 0.): + output[i] = ((numerator / denominator) * cov) + else: + output[i] = NaN + else: + output[i] = cov + else: + output[i] = NaN + + return output diff --git a/setup.py b/setup.py index 650357588570a..bdc54a147bd1a 100755 --- a/setup.py +++ b/setup.py @@ -430,9 +430,9 @@ def pxd(name): 'depends': [srcpath('generated', suffix='.pyx'), srcpath('join', suffix='.pyx')]}, _window={'pyxfile': 'window', - 'pxdfiles': ['src/skiplist','src/util'], - 'depends': ['pandas/src/skiplist.pyx', - 'pandas/src/skiplist.h']}, + 'pxdfiles': ['src/skiplist', 'src/util'], + 'depends': ['pandas/src/skiplist.pyx', + 'pandas/src/skiplist.h']}, parser={'pyxfile': 'parser', 'depends': ['pandas/src/parser/tokenizer.h', 'pandas/src/parser/io.h',