Move dtype handling to algorithm

sinhrks · sinhrks · commit 2392e7c4573b · 2016-04-29T07:14:53.000+09:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -10,6 +10,7 @@
 import pandas.core.common as com
 import pandas.algos as algos
 import pandas.hashtable as htable
+from pandas.types import api as gt
 from pandas.compat import string_types
 from pandas.tslib import iNaT
 
@@ -253,27 +254,28 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
 
     """
     from pandas.core.series import Series
-    from pandas.tools.tile import cut
-
     name = getattr(values, 'name', None)
-    values = Series(values).values
 
     if bins is not None:
         try:
+            from pandas.tools.tile import cut
+            values = Series(values).values
             cat, bins = cut(values, bins, retbins=True)
         except TypeError:
             raise TypeError("bins argument only works with numeric data.")
         values = cat.codes
 
-    if com.is_extension_type(values):
-        result = values.value_counts(dropna=dropna)
+    if com.is_extension_type(values) and not com.is_datetimetz(values):
+        # handle Categorical and sparse,
+        # datetime tz can be handled in ndarray path
+        result = Series(values).values.value_counts(dropna=dropna)
         result.name = name
         counts = result.values
     else:
-        # ndarray path
+        # ndarray path. pass original to handle DatetimeTzBlock
         keys, counts = _value_counts_arraylike(values, dropna=dropna)
 
-        from pandas import Index
+        from pandas import Index, Series
         if not isinstance(keys, Index):
             keys = Index(keys)
         result = Series(counts, index=keys, name=name)
@@ -294,20 +296,23 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
 
 
 def _value_counts_arraylike(values, dropna=True):
-    from pandas import PeriodIndex, DatetimeIndex
+    is_datetimetz = com.is_datetimetz(values)
+    is_period = (isinstance(values, gt.ABCPeriodIndex) or
+                 com.is_period_arraylike(values))
+
+    orig = values
 
+    from pandas.core.series import Series
+    values = Series(values).values
     dtype = values.dtype
-    is_period = com.is_period_arraylike(values)
-    is_datetimetz = com.is_datetimetz(values)
 
-    if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
-            is_datetimetz:
+    if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
+        from pandas.tseries.index import DatetimeIndex
+        from pandas.tseries.period import PeriodIndex
 
         if is_period:
             values = PeriodIndex(values)
-        elif is_datetimetz:
-            tz = getattr(values, 'tz', None)
-            values = DatetimeIndex(values).tz_localize(None)
+            freq = values.freq
 
         values = values.view(np.int64)
         keys, counts = htable.value_count_scalar64(values, dropna)
@@ -316,21 +321,25 @@ def _value_counts_arraylike(values, dropna=True):
             msk = keys != iNaT
             keys, counts = keys[msk], counts[msk]
 
-        # localize to the original tz if necessary
-        if is_datetimetz:
-            keys = DatetimeIndex(keys).tz_localize(tz)
-
         # convert the keys back to the dtype we came in
-        else:
-            keys = keys.astype(dtype)
+        keys = keys.astype(dtype)
+
+        # dtype handling
+        if is_datetimetz:
+            if isinstance(orig, gt.ABCDatetimeIndex):
+                tz = orig.tz
+            else:
+                tz = orig.dt.tz
+            keys = DatetimeIndex._simple_new(keys, tz=tz)
+        if is_period:
+            keys = PeriodIndex._simple_new(keys, freq=freq)
 
     elif com.is_integer_dtype(dtype):
         values = com._ensure_int64(values)
         keys, counts = htable.value_count_scalar64(values, dropna)
     elif com.is_float_dtype(dtype):
         values = com._ensure_float64(values)
         keys, counts = htable.value_count_scalar64(values, dropna)
-
     else:
         values = com._ensure_object(values)
         mask = com.isnull(values)
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -943,18 +943,6 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
         from pandas.core.algorithms import value_counts
         result = value_counts(self, sort=sort, ascending=ascending,
                               normalize=normalize, bins=bins, dropna=dropna)
-
-        if isinstance(self, gt.ABCPeriodIndex):
-            # preserve freq
-            result.index = self._simple_new(result.index.values,
-                                            freq=self.freq)
-        elif com.is_datetimetz(self):
-            if isinstance(self, gt.ABCDatetimeIndex):
-                tz = self.tz
-            else:
-                tz = self.dt.tz
-            result.index = result.index._simple_new(result.index.values,
-                                                    tz=tz)
         return result
 
     def unique(self):
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -1728,6 +1728,29 @@ def test_value_counts_datetime_tz(self):
         tm.assert_series_equal(s.value_counts(normalize=True), exp)
         tm.assert_series_equal(idx.value_counts(normalize=True), exp)
 
+    def test_value_counts_period(self):
+        values = [pd.Period('2011-01', freq='M'),
+                  pd.Period('2011-02', freq='M'),
+                  pd.Period('2011-03', freq='M'),
+                  pd.Period('2011-01', freq='M'),
+                  pd.Period('2011-01', freq='M'),
+                  pd.Period('2011-03', freq='M')]
+
+        exp_idx = pd.PeriodIndex(['2011-01', '2011-03', '2011-02'], freq='M')
+        exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
+
+        s = pd.Series(values, name='xxx')
+        tm.assert_series_equal(s.value_counts(), exp)
+        # check PeriodIndex outputs the same result
+        idx = pd.PeriodIndex(values, name='xxx')
+        tm.assert_series_equal(idx.value_counts(), exp)
+
+        # normalize
+        exp = pd.Series(np.array([3., 2., 1]) / 6.,
+                        index=exp_idx, name='xxx')
+        tm.assert_series_equal(s.value_counts(normalize=True), exp)
+        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
     def test_value_counts_categorical_ordered(self):
         # most dtypes are tested in test_base.py
         values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True)