From db43e4a0cfd5062121910afc99087dde5ef8dd8f Mon Sep 17 00:00:00 2001 From: David Cottrell Date: Wed, 3 Dec 2014 22:34:20 +0000 Subject: [PATCH 1/4] Add SparseSeries.to_coo and from_coo methods for interaction with scipy.sparse. --- doc/source/api.rst | 8 ++ doc/source/sparse.rst | 90 ++++++++++++++++++- doc/source/whatsnew/v0.16.0.txt | 48 +++++++++++ pandas/sparse/scipy_sparse.py | 102 ++++++++++++++++++++++ pandas/sparse/series.py | 46 +++++++++- pandas/sparse/tests/test_sparse.py | 134 +++++++++++++++++++++++++++-- vb_suite/frame_methods.py | 4 +- vb_suite/sparse.py | 26 ++++++ vb_suite/stat_ops.py | 4 +- 9 files changed, 448 insertions(+), 14 deletions(-) create mode 100644 pandas/sparse/scipy_sparse.py diff --git a/doc/source/api.rst b/doc/source/api.rst index a8097f2648c4b..037ea6c75580e 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -620,6 +620,14 @@ Serialization / IO / Conversion Series.to_string Series.to_clipboard +Sparse methods +~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + SparseSeries.to_coo + SparseSeries.from_coo + .. _api.dataframe: DataFrame diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 391aae1cd9105..0809100c2aa86 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -109,10 +109,9 @@ accept scalar values or any 1-dimensional sequence: .. ipython:: python :suppress: - from numpy import nan - .. ipython:: python + from numpy import nan spl.append(np.array([1., nan, nan, 2., 3.])) spl.append(5) spl.append(sparr) @@ -135,3 +134,90 @@ recommend using ``block`` as it's more memory efficient. The ``integer`` format keeps an arrays of all of the locations where the data are not equal to the fill value. The ``block`` format tracks only the locations and sizes of blocks of data. + +.. _sparse.scipysparse: + +Interaction with scipy.sparse +----------------------------- + +Experimental api to transform between sparse pandas and scipy.sparse structures. 
+ +A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``. + +The method requires a ``MultiIndex`` with two or more levels. + +.. ipython:: python + :suppress: + + +.. ipython:: python + + from numpy import nan + s = Series([3.0, nan, 1.0, 3.0, nan, nan]) + s.index = MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + + s + # SparseSeries + ss = s.to_sparse() + ss + +In the example below, we transform the ``SparseSeries`` to a sparse representation of a 2-d array by specifying that the first and second ``MultiIndex`` levels define labels for the rows and the third and fourth levels define labels for the columns. We also specify that the column and row labels should be sorted in the final sparse representation. + +.. ipython:: python + + A, il, jl = ss.to_coo(ilevels=['A', 'B'], jlevels=['C', 'D'], + sort_labels=True) + + A + A.todense() + il + jl + +Specifying different row and column labels (and not sorting them) yields a different sparse matrix: + +.. ipython:: python + + A, il, jl = ss.to_coo(ilevels=['A', 'B', 'C'], jlevels=['D'], + sort_labels=False) + + A + A.todense() + il + jl + +A convenience method :meth:`SparseSeries.from_coo` is implemented for creating a ``SparseSeries`` from a ``scipy.sparse.coo_matrix``. + +.. ipython:: python + :suppress: + +.. ipython:: python + + from scipy import sparse + A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), + shape=(3, 4)) + A + A.todense() + +The default behaviour (with ``dense_index=False``) simply returns a ``SparseSeries`` containing +only the non-null entries. + +.. ipython:: python + + ss = SparseSeries.from_coo(A) + ss + +Specifying ``dense_index=True`` will result in an index that is the Cartesian product of the +row and column coordinates of the matrix. 
Note that this will consume a significant amount of memory +(relative to ``dense_index=False``) if the sparse matrix is large (and sparse) enough. + +.. ipython:: python + + ss_dense = SparseSeries.from_coo(A, dense_index=True) + ss_dense + diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 6082a58687c2c..e9c40ddebbaac 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -110,6 +110,54 @@ Enhancements - Added auto-complete for ``Series.str.``, ``Series.dt.`` and ``Series.cat.`` (:issue:`9322`) +Interaction with scipy.sparse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods + (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` + instances (see :ref:`here <sparse.scipysparse>`). + For example, given a SparseSeries with MultiIndex we can convert to a + ``scipy.sparse.coo_matrix`` by specifying the row and column labels as + index levels: + + .. ipython:: python + + from numpy import nan + s = Series([3.0, nan, 1.0, 3.0, nan, nan]) + s.index = MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + + s + # SparseSeries + ss = s.to_sparse() + ss + + A, il, jl = ss.to_coo(ilevels=['A', 'B'], jlevels=['C', 'D'], + sort_labels=False) + + A + A.todense() + il + jl + + The from_coo method is a convenience method for creating a ``SparseSeries`` + from a ``scipy.sparse.coo_matrix``: + + .. ipython:: python + + from scipy import sparse + A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), + shape=(3, 4)) + A + A.todense() + + ss = SparseSeries.from_coo(A) + ss + Performance ~~~~~~~~~~~ diff --git a/pandas/sparse/scipy_sparse.py b/pandas/sparse/scipy_sparse.py new file mode 100644 index 0000000000000..418a1edec27d4 --- /dev/null +++ b/pandas/sparse/scipy_sparse.py @@ -0,0 +1,102 @@ +""" +Interaction with scipy.sparse matrices. 
+ +Currently only includes SparseSeries.to_coo helpers. +""" +from pandas.core.frame import DataFrame +from pandas.core.index import MultiIndex, Index +from pandas.core.series import Series +import itertools +import numpy +from pandas.compat import OrderedDict +from pandas.tools.util import cartesian_product + + +def _get_label_to_i_dict(labels, sort_labels=False): + """ Return OrderedDict of unique labels to number. Optionally sort by label. """ + labels = Index(map(tuple, labels)).unique().tolist() # squish + if sort_labels: + labels = sorted(list(labels)) + d = OrderedDict((k, i) for i, k in enumerate(labels)) + return(d) + + +def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): + ilabels = list(zip(*[index.get_level_values(i) for i in subset])) + labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) + return(labels_to_i) + + +def _check_is_partition(parts, whole): + whole = set(whole) + parts = [set(x) for x in parts] + if set.intersection(*parts) != set(): + raise ValueError( + 'Is not a partition because intersection is not null.') + if set.union(*parts) != whole: + raise ValueError('Is not a partition becuase union is not the whole.') + + +def _to_ijv(ss, ilevels=(0,), jlevels=(1,), sort_labels=False): + """ For arbitrary (MultiIndexed) SparseSeries return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for + passing to scipy.sparse.coo constructor. 
""" + # index and column levels must be a partition of the index + _check_is_partition([ilevels, jlevels], range(ss.index.nlevels)) + + # from the SparseSeries: get the labels and data for non-null entries + values = ss._data.values._valid_sp_values + blocs = ss._data.values.sp_index.blocs + blength = ss._data.values.sp_index.blengths + nonnull_labels = list( + itertools.chain(*[ss.index.values[i:(i + j)] for i, j in zip(blocs, blength)])) + + def get_indexers(levels): + """ Return sparse coords and dense labels for subset levels """ + values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels] + labels_to_i = _get_index_subset_to_coord_dict( + ss.index, levels, sort_labels=sort_labels) + i_coord = [labels_to_i[i] for i in values_ilabels] + return(i_coord, list(labels_to_i.keys())) + + i_coord, i_labels = get_indexers(ilevels) + j_coord, j_labels = get_indexers(jlevels) + + return(values, i_coord, j_coord, i_labels, j_labels) + + +def _sparse_series_to_coo(ss, ilevels=(0,), jlevels=(1,), sort_labels=False): + """ Convert a SparseSeries to a scipy.sparse.coo_matrix using index levels ilevels, jlevels as the row and column + labels respectively. Returns the sparse_matrix, row and column labels. 
""" + if ss.index.nlevels < 2: + raise ValueError('to_coo requires MultiIndex with nlevels > 2') + if not ss.index.is_unique: + raise ValueError( + 'Duplicate index entries are not allowed in to_coo transformation.') + + # to keep things simple, only rely on integer indexing (not labels) + ilevels = [ss.index._get_level_number(x) for x in ilevels] + jlevels = [ss.index._get_level_number(x) for x in jlevels] + ss = ss.copy() + ss.index.names = [None] * ss.index.nlevels # kill any existing labels + + v, i, j, il, jl = _to_ijv( + ss, ilevels=ilevels, jlevels=jlevels, sort_labels=sort_labels) + import scipy.sparse + sparse_matrix = scipy.sparse.coo_matrix( + (v, (i, j)), shape=(len(il), len(jl))) + return(sparse_matrix, il, jl) + + +def _coo_to_sparse_series(A, dense_index=False): + """ Convert a scipy.sparse.coo_matrix to a SparseSeries. + Use the defaults given in the SparseSeries constructor. """ + s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) + s = s.sort_index() + s = s.to_sparse() # TODO: specify kind? + if dense_index: + # is there a better constructor method to use here? + i = range(A.shape[0]) + j = range(A.shape[1]) + ind = MultiIndex.from_product([i, j]) + s = s.reindex_axis(ind) + return(s) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 39d286f3744e1..a741da14793a0 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -29,12 +29,14 @@ from pandas.util.decorators import Appender +from pandas.sparse.scipy_sparse import _sparse_series_to_coo, _coo_to_sparse_series + #------------------------------------------------------------------------------ # Wrapper function for Series arithmetic methods def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None, - **eval_kwargs): + **eval_kwargs): """ Wrapper function for Series arithmetic operations, to avoid code duplication. 
@@ -654,6 +656,48 @@ def combine_first(self, other): dense_combined = self.to_dense().combine_first(other) return dense_combined.to_sparse(fill_value=self.fill_value) + def to_coo(self, ilevels=(0,), jlevels=(1,), sort_labels=False): + """ + Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. + + Use ilevels and jlevels to determine the row and column coordinates respectively. + ilevels and jlevels are the names (labels) or numbers of the levels. + {ilevels, jlevels} must be a partition of the MultiIndex level names (or numbers). + + Parameters + ---------- + ilevels : tuple/list + jlevels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + + Returns + ------- + y : scipy.sparse.coo_matrix + il : list (row labels) + jl : list (column labels) + """ + A, il, jl = _sparse_series_to_coo( + self, ilevels, jlevels, sort_labels=sort_labels) + return(A, il, jl) + + @classmethod + def from_coo(cls, A, dense_index=False): + """ + Create a SparseSeries from a scipy.sparse.coo_matrix. + + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + If False (default), the SparseSeries index consists of only the coords of the non-null entries of the original coo_matrix. + If True, the SparseSeries index consists of the full sorted (row, col) coordinates of the coo_matrix. 
+ + Returns + ------- + s : SparseSeries + """ + return(_coo_to_sparse_series(A, dense_index=dense_index)) # overwrite series methods with unaccelerated versions ops.add_special_arithmetic_methods(SparseSeries, use_numexpr=False, **ops.series_special_funcs) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 9197a4fc22b9c..3ce12a738dd4f 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -2,6 +2,7 @@ import operator from datetime import datetime +import functools import nose @@ -11,10 +12,10 @@ dec = np.testing.dec from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assert_panel_equal, assertRaisesRegexp) + assert_frame_equal, assert_panel_equal, assertRaisesRegexp, assert_array_equal) from numpy.testing import assert_equal -from pandas import Series, DataFrame, bdate_range, Panel +from pandas import Series, DataFrame, bdate_range, Panel, MultiIndex from pandas.core.datetools import BDay from pandas.core.index import Index from pandas.tseries.index import DatetimeIndex @@ -23,6 +24,7 @@ import pandas.util.testing as tm from pandas.compat import range, lrange, StringIO, lrange from pandas import compat +from pandas.tools.util import cartesian_product import pandas.sparse.frame as spf @@ -30,7 +32,6 @@ from pandas.sparse.api import (SparseSeries, SparseTimeSeries, SparseDataFrame, SparsePanel, SparseArray) - import pandas.tests.test_frame as test_frame import pandas.tests.test_panel as test_panel import pandas.tests.test_series as test_series @@ -168,7 +169,7 @@ def test_construct_DataFrame_with_sp_series(self): assert_sp_series_equal(df['col'], self.bseries) - result = df.iloc[:,0] + result = df.iloc[:, 0] assert_sp_series_equal(result, self.bseries) # blocking @@ -743,6 +744,125 @@ def test_combine_first(self): assert_sp_series_equal(result, expected) +class TestSparseSeriesScipyInteraction(tm.TestCase): + # Issue 8048: add SparseSeries coo 
methods + + def setUp(self): + tm._skip_if_no_scipy() + import scipy.sparse + # SparseSeries inputs used in tests, the tests rely on the order + self.sparse_series = [] + s = pd.Series([3.0, nan, 1.0, 2.0, nan, nan]) + s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + self.sparse_series.append(s.to_sparse()) + + ss = self.sparse_series[0].copy() + ss.index.names = [3, 0, 1, 2] + self.sparse_series.append(ss) + + ss = pd.Series( + [nan] * 12, index=cartesian_product((range(3), range(4)))).to_sparse() + for k, v in zip([(0, 0), (1, 2), (1, 3)], [3.0, 1.0, 2.0]): + ss[k] = v + self.sparse_series.append(ss) + + # results used in tests + self.coo_matrices = [] + self.coo_matrices.append(scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4))) + self.coo_matrices.append(scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))) + self.ils = [[(1, 2), (1, 1), (2, 1)], [(1, 1), (1, 2), (2, 1)]] + self.jls = [[('a', 0), ('a', 1), ('b', 0), ('b', 1)]] + + def test_to_coo_text_names_integer_ilevels_nosort(self): + ss = self.sparse_series[0] + kwargs = {'ilevels': [0, 1], 'jlevels': [2, 3]} + result = (self.coo_matrices[0], self.ils[0], self.jls[0]) + self._run_test(ss, kwargs, result) + + def test_to_coo_text_names_integer_ilevels_sort(self): + ss = self.sparse_series[0] + kwargs = {'ilevels': [0, 1], 'jlevels': [2, 3], 'sort_labels': True} + result = (self.coo_matrices[1], self.ils[1], self.jls[0]) + self._run_test(ss, kwargs, result) + + def test_to_coo_integer_names_integer_ilevels_nosort(self): + ss = self.sparse_series[1] + kwargs = {'ilevels': [3, 0], 'jlevels': [1, 2]} + result = (self.coo_matrices[0], self.ils[0], self.jls[0]) + self._run_test(ss, kwargs, result) + + def test_to_coo_text_names_text_ilevels_nosort(self): + ss = self.sparse_series[0] + kwargs = {'ilevels': ['A', 'B'], 
'jlevels': ['C', 'D']} + result = (self.coo_matrices[0], self.ils[0], self.jls[0]) + self._run_test(ss, kwargs, result) + + def test_to_coo_bad_partition_nonnull_intersection(self): + ss = self.sparse_series[0] + self.assertRaises(ValueError, ss.to_coo, ['A', 'B', 'C'], ['C', 'D']) + + def test_to_coo_bad_partition_small_union(self): + ss = self.sparse_series[0] + self.assertRaises(ValueError, ss.to_coo, ['A'], ['C', 'D']) + + def test_to_coo_nlevels_less_than_two(self): + ss = self.sparse_series[0] + ss.index = np.arange(len(ss.index)) + self.assertRaises(ValueError, ss.to_coo) + + def test_to_coo_bad_ilevel(self): + ss = self.sparse_series[0] + self.assertRaises(KeyError, ss.to_coo, ['A', 'B'], ['C', 'D', 'E']) + + def test_to_coo_duplicate_index_entries(self): + ss = pd.concat( + [self.sparse_series[0], self.sparse_series[0]]).to_sparse() + self.assertRaises(ValueError, ss.to_coo, ['A', 'B'], ['C', 'D']) + + def test_from_coo_dense_index(self): + ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=True) + check = self.sparse_series[2] + assert_sp_series_equal(ss, check) + + def test_from_coo_nodense_index(self): + ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=False) + check = self.sparse_series[2] + check = check.dropna().to_sparse() + assert_sp_series_equal(ss, check) + + def _run_test(self, ss, kwargs, check): + results = ss.to_coo(**kwargs) + self._check_results_to_coo(results, check) + # for every test, also test symmetry property (transpose), switch + # ilevels and jlevels + d = kwargs.copy() + d['ilevels'] = kwargs['jlevels'] + d['jlevels'] = kwargs['ilevels'] + results = ss.to_coo(**d) + results = (results[0].T, results[2], results[1]) + self._check_results_to_coo(results, check) + + @staticmethod + def _check_results_to_coo(results, check): + (A, il, jl) = results + (A_result, il_result, jl_result) = check + # convert to dense and compare + assert_array_equal(A.todense(), A_result.todense()) + # or compare directly as difference 
of sparse + # assert(abs(A - A_result).max() < 1e-12) # max is failing in python + # 2.6 + assert_equal(il, il_result) + assert_equal(jl, jl_result) + + class TestSparseTimeSeries(tm.TestCase): pass @@ -882,9 +1002,9 @@ def test_constructor_from_series(self): # GH 2873 x = Series(np.random.randn(10000), name='a') x = x.to_sparse(fill_value=0) - tm.assert_isinstance(x,SparseSeries) + tm.assert_isinstance(x, SparseSeries) df = SparseDataFrame(x) - tm.assert_isinstance(df,SparseDataFrame) + tm.assert_isinstance(df, SparseDataFrame) x = Series(np.random.randn(10000), name='a') y = Series(np.random.randn(10000), name='b') @@ -1084,7 +1204,7 @@ def test_icol(self): data = {'A': [0, 1]} iframe = SparseDataFrame(data, default_kind='integer') self.assertEqual(type(iframe['A'].sp_index), - type(iframe.icol(0).sp_index)) + type(iframe.icol(0).sp_index)) def test_set_value(self): diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index 334534ed466f2..77e9044e2a40c 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -500,9 +500,9 @@ def get_data(n=100000): frame_from_records_generator = Benchmark('df = DataFrame.from_records(get_data())', setup, name='frame_from_records_generator', - start_date=datetime(2013,10,04)) # issue-4911 + start_date=datetime(2013,10,4)) # issue-4911 frame_from_records_generator_nrows = Benchmark('df = DataFrame.from_records(get_data(), nrows=1000)', setup, name='frame_from_records_generator_nrows', - start_date=datetime(2013,10,04)) # issue-4911 + start_date=datetime(2013,10,4)) # issue-4911 diff --git a/vb_suite/sparse.py b/vb_suite/sparse.py index 1cb0f9233f7e9..99e7150ebc272 100644 --- a/vb_suite/sparse.py +++ b/vb_suite/sparse.py @@ -37,3 +37,29 @@ sparse_constructor = Benchmark(stmt, setup, name="sparse_frame_constructor", start_date=datetime(2012, 6, 1)) + + +setup = common_setup + """ +s = pd.Series([nan] * 10000) +s[0] = 3.0 +s[100] = -1.0 +s[999] = 12.1 +s.index = pd.MultiIndex.from_product((range(10), 
range(10), range(10), range(10))) +ss = s.to_sparse() +""" + +stmt = "ss.to_coo(ilevels=[0, 1], jlevels=[2, 3], sort_labels=True)" + +sparse_series_to_coo = Benchmark(stmt, setup, name="sparse_series_to_coo", + start_date=datetime(2015, 1, 3)) + +setup = common_setup + """ +import scipy.sparse +import pandas.sparse.series +A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) +""" + +stmt = "ss = pandas.sparse.series.from_coo(A)" + +sparse_series_from_coo = Benchmark(stmt, setup, name="sparse_series_from_coo", + start_date=datetime(2015, 1, 3)) diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py index f4ea6706c193c..544ad6d00ed37 100644 --- a/vb_suite/stat_ops.py +++ b/vb_suite/stat_ops.py @@ -86,9 +86,9 @@ start_date=datetime(2011, 12, 12)) stats_rank_pct_average = Benchmark('s.rank(pct=True)', setup, - start_date=datetime(2014, 01, 16)) + start_date=datetime(2014, 1, 16)) stats_rank_pct_average_old = Benchmark('s.rank() / len(s)', setup, - start_date=datetime(2014, 01, 16)) + start_date=datetime(2014, 1, 16)) setup = common_setup + """ values = np.random.randint(0, 100000, size=200000) s = Series(values) From ad117d2d9059baaddbab6f0c08355eafa59703e1 Mon Sep 17 00:00:00 2001 From: David Cottrell Date: Tue, 3 Feb 2015 21:44:07 +0000 Subject: [PATCH 2/4] Add blank line --- pandas/sparse/series.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index a741da14793a0..7959f2d56db17 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -698,6 +698,7 @@ def from_coo(cls, A, dense_index=False): s : SparseSeries """ return(_coo_to_sparse_series(A, dense_index=dense_index)) + # overwrite series methods with unaccelerated versions ops.add_special_arithmetic_methods(SparseSeries, use_numexpr=False, **ops.series_special_funcs) From 21e2cbc1cdb1e6378af6e506db0020faa72b07f6 Mon Sep 17 00:00:00 2001 From: David Cottrell Date: Wed, 4 Feb 2015 21:16:52 +0000 Subject: [PATCH 
3/4] split long line --- pandas/sparse/scipy_sparse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/sparse/scipy_sparse.py b/pandas/sparse/scipy_sparse.py index 418a1edec27d4..6d634a6353205 100644 --- a/pandas/sparse/scipy_sparse.py +++ b/pandas/sparse/scipy_sparse.py @@ -38,7 +38,8 @@ def _check_is_partition(parts, whole): def _to_ijv(ss, ilevels=(0,), jlevels=(1,), sort_labels=False): - """ For arbitrary (MultiIndexed) SparseSeries return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for + """ For arbitrary (MultiIndexed) SparseSeries return + (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo constructor. """ # index and column levels must be a partition of the index _check_is_partition([ilevels, jlevels], range(ss.index.nlevels)) From 9120f71faaa593c5854f6aa1611a73e10b5016d8 Mon Sep 17 00:00:00 2001 From: David Cottrell Date: Tue, 10 Feb 2015 22:00:35 +0000 Subject: [PATCH 4/4] Fix bug in multiindex series groupby where sort argument is ignored (issue 9444). One test added. --- doc/source/whatsnew/v0.16.0.txt | 1 + pandas/core/groupby.py | 2 +- pandas/tests/test_groupby.py | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index e9c40ddebbaac..dad9a2fb30536 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -258,3 +258,4 @@ Bug Fixes - Fixes issue with ``index_col=False`` when ``usecols`` is also specified in ``read_csv``. (:issue:`9082`) - Bug where ``wide_to_long`` would modify the input stubnames list (:issue:`9204`) - Bug in to_sql not storing float64 values using double precision. 
(:issue:`9009`) +- Fixed bug in ``Series.groupby`` where grouping on ``MultiIndex`` levels would ignore the sort argument (:issue:`9444`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index cb5dedc887bca..06fbb55bcabd6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1378,7 +1378,7 @@ def _get_compressed_labels(self): else: if len(all_labels) > 1: group_index = get_group_index(all_labels, self.shape) - comp_ids, obs_group_ids = _compress_group_index(group_index) + comp_ids, obs_group_ids = _compress_group_index(group_index, sort=self.sort) else: ping = self.groupings[0] comp_ids = ping.labels diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 4077f468d8b1f..6f863a49f4ed3 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -107,6 +107,15 @@ def setUp(self): 'E': np.random.randn(11), 'F': np.random.randn(11)}) + index = MultiIndex(levels=[[1, 2], [1, 2]], + labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], + names=['a', 'b']) + self.mseries = Series([0, 1, 2, 3, 4, 5], index=index) + index = MultiIndex(levels=[[1, 2], [1, 2]], + labels=[[0, 0, 1], [1, 0, 0]], + names=['a', 'b']) + self.mseries_result = Series([0, 2, 4], index=index) + def test_basic(self): def checkit(dtype): @@ -4889,6 +4898,14 @@ def test_transform_doesnt_clobber_ints(self): expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) + def test_groupby_sort_multiindex_series(self): + # series multiindex groupby sort argument was not being passed through _compress_group_index + # GH 9444 + result = self.mseries.groupby(level=['a', 'b'], sort=False).first() + assert_series_equal(result, self.mseries_result) + result = self.mseries.groupby(level=['a', 'b'], sort=True).first() + assert_series_equal(result, self.mseries_result.sort_index()) + def test_groupby_categorical_two_columns(self): # https://github.com/pydata/pandas/issues/8138