diff --git a/Makefile b/Makefile index a02fe145c5f0e..baceefe6d49ff 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,7 @@ .PHONY : develop build clean clean_pyc doc lint-diff black +all: develop + clean: -python setup.py clean diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 40256e043a008..c5e099bd44eac 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -10,7 +10,7 @@ pass -class DoesStringLookLikeDatetime(object): +class DoesStringLookLikeDatetime: params = (["2Q2005", "0.0", "10000"],) param_names = ["value"] @@ -23,7 +23,7 @@ def time_check_datetimes(self, value): _does_string_look_like_datetime(obj) -class ConcatDateCols(object): +class ConcatDateCols: params = ([1234567890, "AAAA"], [1, 2]) param_names = ["value", "dim"] diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index f41e13163b3f5..1aed756b841a5 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -240,4 +240,17 @@ def time_qcut_datetime(self, bins): pd.qcut(self.datetime_series, bins) +class Explode: + param_names = ["n_rows", "max_list_length"] + params = [[100, 1000, 10000], [3, 5, 10]] + + def setup(self, n_rows, max_list_length): + + data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)] + self.series = pd.Series(data) + + def time_explode(self, n_rows, max_list_length): + self.series.explode() + + from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index e2835c5156f55..6038a2ab4bd9f 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -219,7 +219,7 @@ def time_series_datetimeindex_repr(self): getattr(self.s, "a", None) -class All(object): +class All: params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] param_names = ["N", "case"] @@ -232,7 +232,7 @@ def time_all(self, N, case): self.s.all() -class 
Any(object): +class Any: params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] param_names = ["N", "case"] @@ -245,7 +245,7 @@ def time_any(self, N, case): self.s.any() -class NanOps(object): +class NanOps: params = [ [ diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index a74527df25f9b..1020b773f8acb 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -293,7 +293,7 @@ def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format="%Y%m%d") -class ToDatetimeCacheSmallCount(object): +class ToDatetimeCacheSmallCount: params = ([True, False], [50, 500, 5000, 100000]) param_names = ["cache", "count"] diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fec2a88292280..96a8440d85694 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -156,7 +156,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts + invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index c0b58fd2d99f5..b1c6172fb1261 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -239,6 +239,7 @@ Reshaping, sorting, transposing DataFrame.unstack DataFrame.swapaxes DataFrame.melt + DataFrame.explode DataFrame.squeeze DataFrame.to_xarray DataFrame.T diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 8d2a764c33a43..7ba625c141f24 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -245,6 +245,7 @@ Reshaping, sorting Series.sort_index Series.swaplevel Series.unstack + 
Series.explode Series.searchsorted Series.ravel Series.repeat @@ -590,4 +591,3 @@ Sparse SparseSeries.to_coo SparseSeries.from_coo - diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b7b6dd0a69c24..0470a6c0c2f42 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -801,3 +801,53 @@ Note to subdivide over multiple columns we can pass in a list to the df.pivot_table( values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) + +.. _reshaping.explode: + +Exploding a list-like column +---------------------------- + +.. versionadded:: 0.25.0 + +Sometimes the values in a column are list-like. + +.. ipython:: python + + keys = ['panda1', 'panda2', 'panda3'] + values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']] + df = pd.DataFrame({'keys': keys, 'values': values}) + df + +We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row: + +.. ipython:: python + + df['values'].explode() + +You can also explode the column in the ``DataFrame``. + +.. ipython:: python + + df.explode('values') + +:meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. + +.. ipython:: python + + s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']]) + s + s.explode() + +Here is a typical usecase. You have comma separated strings in a column and want to expand this. + +.. ipython:: python + + df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, + {'var1': 'd,e,f', 'var2': 2}]) + df + +Creating a long form DataFrame is now straightforward using explode and chained operations + +.. 
ipython:: python + + df.assign(var1=df.var1.str.split(',')).explode('var1') diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 59cd6615b7395..a08159e6c3199 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -182,6 +182,28 @@ The repr now looks like this: json_normalize(data, max_level=1) +.. _whatsnew_0250.enhancements.explode: + +Series.explode to split list-like values to rows +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Series` and :class:`DataFrame` have gained the :meth:`Series.explode` and :meth:`DataFrame.explode` methods to transform list-likes to individual rows. See :ref:`section on Exploding list-like column <reshaping.explode>` in docs for more information (:issue:`16538`, :issue:`10511`) + + +Here is a typical usecase. You have comma separated strings in a column. + +.. ipython:: python + + df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, + {'var1': 'd,e,f', 'var2': 2}]) + df + +Creating a long form ``DataFrame`` is now straightforward using chained operations + +.. ipython:: python + + df.assign(var1=df.var1.str.split(',')).explode('var1') + .. _whatsnew_0250.enhancements.other: Other enhancements diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd new file mode 100644 index 0000000000000..12aca9dabe2e7 --- /dev/null +++ b/pandas/_libs/lib.pxd @@ -0,0 +1 @@ +cdef bint c_is_list_like(object, bint) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1936404b75602..27ee685acfde7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,3 +1,4 @@ +from collections import abc from decimal import Decimal from fractions import Fraction from numbers import Number @@ -886,6 +887,60 @@ def is_period(val: object) -> bool: return util.is_period_object(val) +def is_list_like(obj: object, allow_sets: bool = True): + """ + Check if the object is list-like. + + Objects that are considered list-like are for example Python + lists, tuples, sets, NumPy arrays, and Pandas Series. 
+ + Strings and datetime objects, however, are not considered list-like. + + Parameters + ---------- + obj : The object to check + allow_sets : boolean, default True + If this parameter is False, sets will not be considered list-like + + .. versionadded:: 0.24.0 + + Returns + ------- + is_list_like : bool + Whether `obj` has list-like properties. + + Examples + -------- + >>> is_list_like([1, 2, 3]) + True + >>> is_list_like({1, 2, 3}) + True + >>> is_list_like(datetime(2017, 1, 1)) + False + >>> is_list_like("foo") + False + >>> is_list_like(1) + False + >>> is_list_like(np.array([2])) + True + >>> is_list_like(np.array(2)) + False + """ + return c_is_list_like(obj, allow_sets) + + +cdef inline bint c_is_list_like(object obj, bint allow_sets): + return ( + isinstance(obj, abc.Iterable) + # we do not count strings/unicode/bytes as list-like + and not isinstance(obj, (str, bytes)) + # exclude zero-dimensional numpy arrays, effectively scalars + and not (util.is_array(obj) and obj.ndim == 0) + # exclude sets if allow_sets is False + and not (allow_sets is False and isinstance(obj, abc.Set)) + ) + + _TYPE_MAP = { 'categorical': 'categorical', 'category': 'categorical', diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 35b2ab4aa5326..f229de002ce5c 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -2,8 +2,11 @@ import cython from cython import Py_ssize_t from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t) - + uint32_t, uint64_t, float32_t, float64_t, ndarray) +cimport numpy as cnp +import numpy as np +from pandas._libs.lib cimport c_is_list_like +cnp.import_array() ctypedef fused reshape_t: uint8_t @@ -91,3 +94,59 @@ unstack_int64 = unstack["int64_t"] unstack_float32 = unstack["float32_t"] unstack_float64 = unstack["float64_t"] unstack_object = unstack["object"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def explode(ndarray[object] values): + 
""" + transform array list-likes to long form + preserve non-list entries + + Parameters + ---------- + values : object ndarray + + Returns + ------- + tuple(values, counts) + """ + cdef: + Py_ssize_t i, j, count, n + object v + ndarray[object] result + ndarray[int64_t] counts + + # find the resulting len + n = len(values) + counts = np.zeros(n, dtype='int64') + for i in range(n): + v = values[i] + if c_is_list_like(v, False): + if len(v): + counts[i] += len(v) + else: + # empty list-like, use a nan marker + counts[i] += 1 + else: + counts[i] += 1 + + result = np.empty(counts.sum(), dtype='object') + count = 0 + for i in range(n): + v = values[i] + + if c_is_list_like(v, False): + if len(v): + for j in range(len(v)): + result[count] = v[j] + count += 1 + else: + # empty list-like, use a nan marker + result[count] = np.nan + count += 1 + else: + # replace with the existing scalar + result[count] = v + count += 1 + return result, counts diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 9373ea18e8a24..461b5cc6232cd 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -23,6 +23,8 @@ is_interval = lib.is_interval +is_list_like = lib.is_list_like + def is_number(obj): """ @@ -241,60 +243,6 @@ def is_re_compilable(obj): return True -def is_list_like(obj, allow_sets=True): - """ - Check if the object is list-like. - - Objects that are considered list-like are for example Python - lists, tuples, sets, NumPy arrays, and Pandas Series. - - Strings and datetime objects, however, are not considered list-like. - - Parameters - ---------- - obj : The object to check - allow_sets : boolean, default True - If this parameter is False, sets will not be considered list-like - - .. versionadded:: 0.24.0 - - Returns - ------- - is_list_like : bool - Whether `obj` has list-like properties. 
- - Examples - -------- - >>> is_list_like([1, 2, 3]) - True - >>> is_list_like({1, 2, 3}) - True - >>> is_list_like(datetime(2017, 1, 1)) - False - >>> is_list_like("foo") - False - >>> is_list_like(1) - False - >>> is_list_like(np.array([2])) - True - >>> is_list_like(np.array(2))) - False - """ - - return ( - isinstance(obj, abc.Iterable) - and - # we do not count strings/unicode/bytes as list-like - not isinstance(obj, (str, bytes)) - and - # exclude zero-dimensional numpy arrays, effectively scalars - not (isinstance(obj, np.ndarray) and obj.ndim == 0) - and - # exclude sets if allow_sets is False - not (allow_sets is False and isinstance(obj, abc.Set)) - ) - - def is_array_like(obj): """ Check if the object is array-like. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f45a13249b16c..c15f4ad8e1900 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,7 +15,7 @@ import itertools import sys from textwrap import dedent -from typing import FrozenSet, List, Optional, Set, Type, Union +from typing import FrozenSet, List, Optional, Set, Tuple, Type, Union import warnings import numpy as np @@ -6237,6 +6237,75 @@ def stack(self, level=-1, dropna=True): else: return stack(self, level, dropna=dropna) + def explode(self, column: Union[str, Tuple]) -> "DataFrame": + """ + Transform each element of a list-like to a row, replicating the + index values. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + column : str or tuple + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + + Raises + ------ + ValueError : + if columns of the frame are not unique. + + See Also + -------- + DataFrame.unstack : Pivot a level of the (necessarily hierarchical) + index labels + DataFrame.melt : Unpivot a DataFrame from wide format to long format + Series.explode : Transform each list-like element of a Series to a row. 
+ + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. + + Examples + -------- + >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + >>> df + A B + 0 [1, 2, 3] 1 + 1 foo 1 + 2 [] 1 + 3 [3, 4] 1 + + >>> df.explode('A') + A B + 0 1 1 + 0 2 1 + 0 3 1 + 1 foo 1 + 2 NaN 1 + 3 3 1 + 3 4 1 + """ + + if not (is_scalar(column) or isinstance(column, tuple)): + raise ValueError("column must be a scalar") + if not self.columns.is_unique: + raise ValueError("columns must be unique") + + result = self[column].explode() + return ( + self.drop([column], axis=1) + .join(result) + .reindex(columns=self.columns, copy=False) + ) + def unstack(self, level=-1, fill_value=None): """ Pivot a level of the (necessarily hierarchical) index labels, returning @@ -6339,6 +6408,7 @@ def unstack(self, level=-1, fill_value=None): %(other)s pivot_table DataFrame.pivot + Series.explode Examples -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e084f99ec5a2c..7bbd30e0c28b1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -963,6 +963,7 @@ def _assert_take_fillable( @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): + repeats = ensure_platform_int(repeats) nv.validate_repeat(tuple(), dict(axis=axis)) return self._shallow_copy(self._values.repeat(repeats)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 670a4666a3440..b673c119c0498 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2091,10 +2091,11 @@ def argsort(self, *args, **kwargs): @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) + repeats = ensure_platform_int(repeats) 
return MultiIndex( levels=self.levels, codes=[ - level_codes.view(np.ndarray).repeat(repeats) + level_codes.view(np.ndarray).astype(np.intp).repeat(repeats) for level_codes in self.codes ], names=self.names, diff --git a/pandas/core/series.py b/pandas/core/series.py index 46b96c1ece77c..8082069efce3c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -12,7 +12,7 @@ from pandas._config import get_option -from pandas._libs import iNaT, index as libindex, lib, tslibs +from pandas._libs import iNaT, index as libindex, lib, reshape, tslibs from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, deprecate @@ -33,6 +33,7 @@ is_integer, is_iterator, is_list_like, + is_object_dtype, is_scalar, is_string_like, is_timedelta64_dtype, @@ -2007,7 +2008,7 @@ def drop_duplicates(self, keep="first", inplace=False): Examples -------- - Generate an Series with duplicated entries. + Generate a Series with duplicated entries. >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], ... name='animal') @@ -3635,6 +3636,62 @@ def reorder_levels(self, order): result.index = result.index.reorder_levels(order) return result + def explode(self) -> "Series": + """ + Transform each element of a list-like to a row, replicating the + index values. + + .. versionadded:: 0.25.0 + + Returns + ------- + Series + Exploded lists to rows; index will be duplicated for these rows. + + See Also + -------- + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex + to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. 
Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. + + Examples + -------- + >>> s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s + 0 [1, 2, 3] + 1 foo + 2 [] + 3 [3, 4] + dtype: object + + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 foo + 2 NaN + 3 3 + 3 4 + dtype: object + """ + if not len(self) or not is_object_dtype(self): + return self.copy() + + values, counts = reshape.explode(np.asarray(self.array)) + + result = Series(values, index=self.index.repeat(counts), name=self.name) + return result + def unstack(self, level=-1, fill_value=None): """ Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py new file mode 100644 index 0000000000000..b4330aadbfba3 --- /dev/null +++ b/pandas/tests/frame/test_explode.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.util import testing as tm + + +def test_error(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + with pytest.raises(ValueError): + df.explode(list("AA")) + + df.columns = list("AA") + with pytest.raises(ValueError): + df.explode("A") + + +def test_basic(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode("A") + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_rows(): + df = pd.DataFrame( + {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1}, + index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]), + ) + + result = df.explode("A") + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.MultiIndex.from_tuples( + [ + ("a", 1), + ("a", 1), + ("a", 1), + ("a", 2), + 
("b", 1), + ("b", 2), + ("b", 2), + ] + ), + dtype=object, + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_columns(): + df = pd.DataFrame( + {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1} + ) + + result = df.explode(("A", 1)) + expected = pd.DataFrame( + { + ("A", 1): pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.Index([0, 0, 0, 1, 2, 3, 3]), + dtype=object, + ), + ("A", 2): 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_usecase(): + # explode a single column + # gh-10511 + df = pd.DataFrame( + [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC") + ).set_index("C") + result = df.explode("B") + + expected = pd.DataFrame( + { + "A": [11, 11, 11, 11, 11, 22, 22, 22], + "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), + "C": [10, 10, 10, 10, 10, 20, 20, 20], + }, + columns=list("ABC"), + ).set_index("C") + + tm.assert_frame_equal(result, expected) + + # gh-8517 + df = pd.DataFrame( + [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], + columns=["dt", "name", "text"], + ) + result = df.assign(text=df.text.str.split(" ")).explode("text") + expected = pd.DataFrame( + [ + ["2014-01-01", "Alice", "A"], + ["2014-01-01", "Alice", "B"], + ["2014-01-02", "Bob", "C"], + ["2014-01-02", "Bob", "D"], + ], + columns=["dt", "name", "text"], + index=[0, 0, 1, 1], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py new file mode 100644 index 0000000000000..331546f7dc73d --- /dev/null +++ b/pandas/tests/series/test_explode.py @@ -0,0 +1,113 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.util import testing as tm + + +def test_basic(): + s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo") + result = s.explode() + expected = pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo" + ) + 
tm.assert_series_equal(result, expected) + + +def test_mixed_type(): + s = pd.Series( + [[0, 1, 2], np.nan, None, np.array([]), pd.Series(["a", "b"])], name="foo" + ) + result = s.explode() + expected = pd.Series( + [0, 1, 2, np.nan, None, np.nan, "a", "b"], + index=[0, 0, 0, 1, 2, 3, 4, 4], + dtype=object, + name="foo", + ) + tm.assert_series_equal(result, expected) + + +def test_empty(): + s = pd.Series() + result = s.explode() + expected = s.copy() + tm.assert_series_equal(result, expected) + + +def test_nested_lists(): + s = pd.Series([[[1, 2, 3]], [1, 2], 1]) + result = s.explode() + expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2]) + tm.assert_series_equal(result, expected) + + +def test_multi_index(): + s = pd.Series( + [[0, 1, 2], np.nan, [], (3, 4)], + name="foo", + index=pd.MultiIndex.from_product([list("ab"), range(2)], names=["foo", "bar"]), + ) + result = s.explode() + index = pd.MultiIndex.from_tuples( + [("a", 0), ("a", 0), ("a", 0), ("a", 1), ("b", 0), ("b", 1), ("b", 1)], + names=["foo", "bar"], + ) + expected = pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=index, dtype=object, name="foo" + ) + tm.assert_series_equal(result, expected) + + +def test_large(): + s = pd.Series([range(256)]).explode() + result = s.explode() + tm.assert_series_equal(result, s) + + +def test_invert_array(): + df = pd.DataFrame({"a": pd.date_range("20190101", periods=3, tz="UTC")}) + + listify = df.apply(lambda x: x.array, axis=1) + result = listify.explode() + tm.assert_series_equal(result, df["a"].rename()) + + +@pytest.mark.parametrize( + "s", [pd.Series([1, 2, 3]), pd.Series(pd.date_range("2019", periods=3, tz="UTC"))] +) +def test_non_object_dtype(s): + result = s.explode() + tm.assert_series_equal(result, s) + + +def test_typical_usecase(): + + df = pd.DataFrame( + [{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}], + columns=["var1", "var2"], + ) + exploded = df.var1.str.split(",").explode() + exploded + result = df[["var2"]].join(exploded) + 
expected = pd.DataFrame( + {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, + columns=["var2", "var1"], + index=[0, 0, 0, 1, 1, 1], + ) + tm.assert_frame_equal(result, expected) + + +def test_nested_EA(): + # a nested EA array + s = pd.Series( + [ + pd.date_range("20170101", periods=3, tz="UTC"), + pd.date_range("20170104", periods=3, tz="UTC"), + ] + ) + result = s.explode() + expected = pd.Series( + pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1] + ) + tm.assert_series_equal(result, expected)