diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 07c856c96426d..4089f9523724f 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -178,6 +178,75 @@ To test for membership in the values, use the method :meth:`~pandas.Series.isin` For ``DataFrames``, likewise, ``in`` applies to the column axis, testing for membership in the list of column names. +.. _udf-mutation: + +Mutating with User Defined Function (UDF) methods +------------------------------------------------- + +It is a general rule in programming that one should not mutate a container +while it is being iterated over. Mutation will invalidate the iterator, +causing unexpected behavior. Consider the example: + +.. ipython:: python + + values = [0, 1, 2, 3, 4, 5] + n_removed = 0 + for k, value in enumerate(values): + idx = k - n_removed + if value % 2 == 1: + del values[idx] + n_removed += 1 + else: + values[idx] = value + 1 + values + +One might have expected the result to be ``[1, 3, 5]``, but mutating +``values`` during iteration causes elements to be skipped. Likewise, when +using a pandas method that takes a UDF, pandas is often internally iterating +over the ``DataFrame`` or other pandas object. Therefore, if the UDF mutates +(changes) the ``DataFrame``, unexpected behavior can arise. + +Here is a similar example with :meth:`DataFrame.apply`: + +.. ipython:: python + + def f(s): + s.pop("a") + return s + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + try: + df.apply(f, axis="columns") + except Exception as err: + print(repr(err)) + +To resolve this issue, one can make a copy so that the mutation does +not apply to the container being iterated over. + +.. ipython:: python + + values = [0, 1, 2, 3, 4, 5] + n_removed = 0 + for k, value in enumerate(values.copy()): + idx = k - n_removed + if value % 2 == 1: + del values[idx] + n_removed += 1 + else: + values[idx] = value + 1 + values + +.. 
ipython:: python + + def f(s): + s = s.copy() + s.pop("a") + return s + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df.apply(f, axis="columns") + + ``NaN``, Integer ``NA`` values and ``NA`` type promotions --------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 63d238da12101..2b09853fc2442 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7814,6 +7814,12 @@ def apply( DataFrame.aggregate: Only perform aggregating type operations. DataFrame.transform: Only perform transforming type operations. + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. + Examples -------- >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a7297923f1034..a658dcf259a81 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -580,6 +580,12 @@ def filter(self, func, dropna=True, *args, **kwargs): dropna : Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. + Examples -------- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', @@ -1506,6 +1512,10 @@ def filter(self, func, dropna=True, *args, **kwargs): Each subframe is endowed the attribute 'name' in case you need to know which group you are working on. + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. 
+ Examples -------- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5758762c13984..66e7bc78b2f81 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -344,7 +344,7 @@ class providing the base-class of operations. in the subframe. If f also supports application to the entire subframe, then a fast path is used starting from the second chunk. * f must not mutate groups. Mutation is not supported and may - produce unexpected results. + produce unexpected results. See :ref:`udf-mutation` for more details. When using ``engine='numba'``, there will be no "fall back" behavior internally. The group data and group index will be passed as numpy arrays to the JITed @@ -447,6 +447,10 @@ class providing the base-class of operations. The group data and group index will be passed as numpy arrays to the JITed user defined function, and no alternative execution attempts will be tried. {examples} + +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`udf-mutation` +for more details. """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 7d97c9f6189f3..4d2203ff5621a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4044,6 +4044,12 @@ def apply( Series.agg: Only perform aggregating type operations. Series.transform: Only perform transforming type operations. + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`udf-mutation` + for more details. + Examples -------- Create a series with typical summer temperatures for each city. diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index ad2eafe7295b0..49eb87a3bc8ba 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -41,6 +41,10 @@ ----- `agg` is an alias for `aggregate`. Use the alias. 
+Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`udf-mutation` +for more details. + A passed user-defined-function will be passed a Series for evaluation. {examples}""" @@ -296,6 +300,12 @@ {klass}.agg : Only perform aggregating type operations. {klass}.apply : Invoke function on a {klass}. +Notes +----- +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`udf-mutation` +for more details. + Examples -------- >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}})