From 36ec9a1c301f5bd16d5094d151d96959cc0a0462 Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Wed, 19 Oct 2016 14:00:21 -0400 Subject: [PATCH 001/183] Update tile.py --- pandas/tools/tile.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 62bbfc2f630a5..0cd23af4c9395 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -119,7 +119,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=include_lowest) -def qcut(x, q, labels=None, retbins=False, precision=3): +def qcut(x, q, labels=None, retbins=False, precision=3, + duplicate_edges='raise'): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. For example @@ -141,6 +142,9 @@ def qcut(x, q, labels=None, retbins=False, precision=3): as a scalar. precision : int The precision at which to store and display the bins labels + duplicate_edges : {'raise', 'drop'}, optional + If binned edges are not unique, raise ValueError or drop non- + uniques. Returns ------- @@ -172,11 +176,13 @@ def qcut(x, q, labels=None, retbins=False, precision=3): quantiles = q bins = algos.quantile(x, quantiles) return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, - precision=precision, include_lowest=True) + precision=precision, include_lowest=True, + duplicate_edges='raise') def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, - precision=3, name=None, include_lowest=False): + precision=3, name=None, include_lowest=False, + duplicate_edges='raise'): x_is_series = isinstance(x, Series) series_index = None @@ -190,8 +196,13 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) + if len(algos.unique(bins)) < len(bins): - raise ValueError('Bin edges must be unique: %s' % repr(bins)) + if (duplicate_edges == 'raise'): + raise ValueError('Bin edges must be unique: %s' + % repr(bins)) + else: + bins = algos.unique(bins) if include_lowest: ids[x == bins[0]] = 1 From 445088448f1f33ec068839575a17c9bbc1ebdd8e Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Wed, 19 Oct 2016 17:54:45 -0400 Subject: [PATCH 002/183] Update tile.py --- pandas/tools/tile.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 0cd23af4c9395..9415129947161 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -119,7 +119,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=include_lowest) -def qcut(x, q, labels=None, retbins=False, precision=3, +def qcut(x, q, labels=None, retbins=False, precision=3, duplicate_edges='raise'): """ Quantile-based discretization function. 
Discretize variable into @@ -196,10 +196,9 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) - if len(algos.unique(bins)) < len(bins): if (duplicate_edges == 'raise'): - raise ValueError('Bin edges must be unique: %s' + raise ValueError('Bin edges must be unique: %s' % repr(bins)) else: bins = algos.unique(bins) From 0b6946bbe8f4c946bbde9ddf6147ce1654c12d33 Mon Sep 17 00:00:00 2001 From: dubourg Date: Thu, 20 Oct 2016 12:25:34 +0200 Subject: [PATCH 003/183] Type codes and categories as lists instead of tuples in _factorize_from_iterables (fixes #14438) (#14449) --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/core/categorical.py | 10 +++++----- pandas/tests/frame/test_combine_concat.py | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 5180b9a092f6c..f9f7f0847c650 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -45,6 +45,7 @@ Bug Fixes - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) +- Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) - Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`) - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 9efaff6060909..fd1a23a5bab7f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2055,14 +2055,14 @@ def _factorize_from_iterables(iterables): Returns ------- - codes_tuple : tuple of ndarrays - categories_tuple : tuple of Indexes + codes_list : list of ndarrays + categories_list : list of Indexes Notes ----- See `_factorize_from_iterable` for more info. """ if len(iterables) == 0: - # For consistency, it should return a list of 2 tuples. - return [(), ()] - return lzip(*[_factorize_from_iterable(it) for it in iterables]) + # For consistency, it should return a list of 2 lists. 
+ return [[], []] + return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables])) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 81aa694577fb5..5b5236843643d 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -57,6 +57,24 @@ def test_concat_multiple_tzs(self): expected = DataFrame(dict(time=[ts2, ts3])) assert_frame_equal(results, expected) + def test_concat_tuple_keys(self): + # GH 14438 + df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB')) + df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB')) + results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')]) + expected = pd.DataFrame( + {'A': {('bee', 'bah', 0): 1.0, + ('bee', 'bah', 1): 1.0, + ('bee', 'boo', 0): 2.0, + ('bee', 'boo', 1): 2.0, + ('bee', 'boo', 2): 2.0}, + 'B': {('bee', 'bah', 0): 1.0, + ('bee', 'bah', 1): 1.0, + ('bee', 'boo', 0): 2.0, + ('bee', 'boo', 1): 2.0, + ('bee', 'boo', 2): 2.0}}) + assert_frame_equal(results, expected) + def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=['foo', 'bar', 'baz', 'qux']) From 2d3a739b448f0368cba75508c19573ca169a16b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20Vall=C3=A9s=20P=C3=A9rez?= Date: Thu, 20 Oct 2016 06:28:21 -0400 Subject: [PATCH 004/183] ERR: Checks for left_index and right_index merge parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Iván Vallés Pérez Closes #14434 from ivallesp/add-check-for-merge-indices and squashes the following commits: e18b7c9 [Iván Vallés Pérez] Add some checks for assuring that the left_index and right_index parameters have correct types. Tests added. --- doc/source/whatsnew/v0.19.1.txt | 4 ++++ pandas/tools/merge.py | 9 +++++++++ pandas/tools/tests/test_merge.py | 9 +++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index f9f7f0847c650..292d9698eefe3 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -26,6 +26,7 @@ Performance Improvements + .. 
_whatsnew_0191.bug_fixes: Bug Fixes @@ -42,6 +43,9 @@ Bug Fixes +- ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`) + + - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index ce7f8908d7506..86e2e8aabbee1 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -472,6 +472,15 @@ def __init__(self, left, right, how='inner', on=None, 'can not merge DataFrame with instance of ' 'type {0}'.format(type(right))) + if not is_bool(left_index): + raise ValueError( + 'left_index parameter must be of type bool, not ' + '{0}'.format(type(left_index))) + if not is_bool(right_index): + raise ValueError( + 'right_index parameter must be of type bool, not ' + '{0}'.format(type(right_index))) + # warn user when merging between different levels if left.columns.nlevels != right.columns.nlevels: msg = ('merging between different levels can give an unintended ' diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6e36100ddd0b4..f078959608f91 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -109,6 +109,15 @@ def test_merge_misspecified(self): self.assertRaises(ValueError, merge, self.df, self.df2, left_on=['key1'], right_on=['key1', 'key2']) + def test_index_and_on_parameters_confusion(self): + self.assertRaises(ValueError, merge, self.df, self.df2, how='left', + left_index=False, right_index=['key1', 'key2']) + self.assertRaises(ValueError, merge, self.df, self.df2, how='left', + left_index=['key1', 'key2'], right_index=False) + self.assertRaises(ValueError, merge, self.df, self.df2, how='left', + left_index=['key1', 'key2'], + right_index=['key1', 'key2']) + def test_merge_overlap(self): merged = merge(self.left, self.left, on='key') exp_len = (self.left['key'].value_counts() ** 2).sum() From 921ce47c1a3da6b24965890c622aed61423d3bf7 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 20 Oct 2016 06:58:29 -0400 Subject: [PATCH 005/183] BUG: pivot_table may raise TypeError without values ``pivot_table`` raises TypeError`` when ``index`` or ``columns`` is array-like and ``values`` is not specified. 
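Illustrative sketch, not part of the patch itself (it mirrors the new test added to pandas/tools/tests/test_pivot.py below), of the call pattern this change fixes:

    import pandas as pd

    idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-01-02',
                            '2011-01-01', '2011-01-02'])
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5]}, index=idx)

    # index/columns are array-like (not column labels) and values is omitted;
    # previously the grouping keys could not be dropped from data.columns,
    # so this raised TypeError/ValueError instead of aggregating column 'A'.
    res = df.pivot_table(index=df.index.month, columns=df.index.day)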
Author: sinhrks Closes #14380 from sinhrks/pivot_table_bug and squashes the following commits: be426db [sinhrks] BUG: pivot_table may raise TypeError without values --- doc/source/whatsnew/v0.19.1.txt | 3 +++ pandas/tools/pivot.py | 12 +++++++++--- pandas/tools/tests/test_pivot.py | 33 ++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 292d9698eefe3..30593c1b204e7 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -53,3 +53,6 @@ Bug Fixes - Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`) - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) + +- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` + is not scalar and ``values`` is not specified (:issue:`14380`) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 94b464f6fca6c..9e064a1d1fc99 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -101,10 +101,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', else: values_multi = False values = [values] - else: - values = list(data.columns.drop(keys)) - if values_passed: to_filter = [] for x in keys + values: if isinstance(x, Grouper): @@ -117,6 +114,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if len(to_filter) < len(data.columns): data = data[to_filter] + else: + values = data.columns + for key in keys: + try: + values = values.drop(key) + except (TypeError, ValueError): + pass + values = list(values) + grouped = data.groupby(keys) agged = grouped.agg(aggfunc) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 75c6db23b4bc7..5944fa1b34611 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -131,6 +131,39 @@ def test_pivot_dtypes(self): expected = Series(dict(float64=2)) tm.assert_series_equal(result, expected) + def test_pivot_no_values(self): + # GH 14380 + idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-01-02', + '2011-01-01', '2011-01-02']) + df = pd.DataFrame({'A': [1, 2, 3, 4, 5]}, + index=idx) + res = df.pivot_table(index=df.index.month, columns=df.index.day) + + exp_columns = pd.MultiIndex.from_tuples([('A', 1), ('A', 2)]) + exp = pd.DataFrame([[2.5, 4.0], [2.0, np.nan]], + index=[1, 2], columns=exp_columns) + tm.assert_frame_equal(res, exp) + + df = pd.DataFrame({'A': [1, 2, 3, 4, 5], + 'dt': pd.date_range('2011-01-01', freq='D', + periods=5)}, + index=idx) + res = df.pivot_table(index=df.index.month, + columns=pd.Grouper(key='dt', freq='M')) + exp_columns = pd.MultiIndex.from_tuples([('A', + pd.Timestamp('2011-01-31'))]) + exp_columns.names = [None, 'dt'] + exp = pd.DataFrame([3.25, 2.0], + index=[1, 2], columns=exp_columns) + tm.assert_frame_equal(res, exp) + + res = df.pivot_table(index=pd.Grouper(freq='A'), + columns=pd.Grouper(key='dt', freq='M')) + exp = pd.DataFrame([3], + index=pd.DatetimeIndex(['2011-12-31']), + columns=exp_columns) + tm.assert_frame_equal(res, exp) + def test_pivot_multi_values(self): result = pivot_table(self.data, values=['D', 'E'], index='A', columns=['B', 'C'], fill_value=0) From 65362aa4f06f01efdc20ca487c1c3c1f090613ee Mon Sep 17 
00:00:00 2001 From: chris-b1 Date: Thu, 20 Oct 2016 08:46:01 -0500 Subject: [PATCH 006/183] BUG: underflow on Timestamp creation (#14433) * BUG: underflow on Timestamp creation * undo change to lower bound * change lower bound; but keep rounding to us --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/lib.pyx | 9 ++------- pandas/src/datetime/np_datetime.c | 21 ++++++++++++++------- pandas/src/inference.pyx | 16 +++------------- pandas/src/util.pxd | 14 ++++++++++++++ pandas/tseries/tests/test_timeseries.py | 9 +++++++++ pandas/tslib.pyx | 13 ++++++++----- 7 files changed, 51 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 30593c1b204e7..b2facd4e2d0ec 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -46,6 +46,7 @@ Bug Fixes - ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`) +- Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index b56a02b245d69..ef3407ffd5388 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -65,13 +65,8 @@ cdef int64_t NPY_NAT = util.get_nat() ctypedef unsigned char UChar cimport util -from util cimport is_array, _checknull, _checknan - -cdef extern from "headers/stdint.h": - enum: UINT8_MAX - enum: INT64_MAX - enum: INT64_MIN - +from util cimport (is_array, _checknull, _checknan, INT64_MAX, + INT64_MIN, UINT8_MAX) cdef extern from "math.h": double sqrt(double x) diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c index 80703c8b08de6..d4b9de45618f3 100644 --- a/pandas/src/datetime/np_datetime.c +++ b/pandas/src/datetime/np_datetime.c @@ -846,7 +846,8 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, dt = dt % perday; } else { - set_datetimestruct_days((dt - (perday-1)) / perday, out); + set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), + out); dt = (perday-1) + (dt + 1) % perday; } out->hour = dt; @@ -860,7 +861,8 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, dt = dt % perday; } else { - set_datetimestruct_days((dt - (perday-1)) / perday, out); + set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), + out); dt = (perday-1) + (dt + 1) % perday; } out->hour = dt / 60; @@ -875,7 +877,8 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, dt = dt % perday; } else { - set_datetimestruct_days((dt - (perday-1)) / perday, out); + set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), + out); dt = (perday-1) + (dt + 1) % perday; } out->hour = dt / (60*60); @@ -891,7 +894,8 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, dt = dt % perday; } else { - set_datetimestruct_days((dt - (perday-1)) / perday, out); + set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), + out); dt = (perday-1) + (dt + 1) % perday; } out->hour = dt / (60*60*1000LL); @@ -908,7 +912,8 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, dt = dt % perday; } else { - set_datetimestruct_days((dt - (perday-1)) / perday, out); + set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 
0 : 1), + out); dt = (perday-1) + (dt + 1) % perday; } out->hour = dt / (60*60*1000000LL); @@ -925,7 +930,8 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, dt = dt % perday; } else { - set_datetimestruct_days((dt - (perday-1)) / perday, out); + set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), + out); dt = (perday-1) + (dt + 1) % perday; } out->hour = dt / (60*60*1000000000LL); @@ -943,7 +949,8 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, dt = dt % perday; } else { - set_datetimestruct_days((dt - (perday-1)) / perday, out); + set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), + out); dt = (perday-1) + (dt + 1) % perday; } out->hour = dt / (60*60*1000000000000LL); diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 4fa730eac0fd1..5ac2c70bb1808 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -6,19 +6,9 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 -cdef extern from "headers/stdint.h": - enum: UINT8_MAX - enum: UINT16_MAX - enum: UINT32_MAX - enum: UINT64_MAX - enum: INT8_MIN - enum: INT8_MAX - enum: INT16_MIN - enum: INT16_MAX - enum: INT32_MAX - enum: INT32_MIN - enum: INT64_MAX - enum: INT64_MIN +from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX, + INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX, + INT32_MAX, INT32_MIN, INT64_MAX, INT64_MIN) # core.common import for fast inference checks diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index fcb5583a0a6e7..fdbfbf62af7d2 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -38,6 +38,20 @@ ctypedef fused numeric: cnp.float32_t cnp.float64_t +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: UINT16_MAX + enum: UINT32_MAX + enum: UINT64_MAX + enum: INT8_MIN + enum: INT8_MAX + enum: INT16_MIN + enum: INT16_MAX + enum: INT32_MAX + enum: INT32_MIN + enum: INT64_MAX + enum: INT64_MIN + cdef inline object get_value_at(ndarray arr, object loc): cdef: Py_ssize_t i, sz diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index f640b3974b360..c13805d383e5d 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4463,6 +4463,15 @@ def test_basics_nanos(self): self.assertEqual(stamp.microsecond, 0) self.assertEqual(stamp.nanosecond, 500) + # GH 14415 + val = np.iinfo(np.int64).min + 80000000000000 + stamp = Timestamp(val) + self.assertEqual(stamp.year, 1677) + self.assertEqual(stamp.month, 9) + self.assertEqual(stamp.day, 21) + self.assertEqual(stamp.microsecond, 145224) + self.assertEqual(stamp.nanosecond, 192) + def test_unit(self): def check(val, unit=None, h=1, s=1, us=0): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index bab45595cd60f..81e721e610cc6 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -24,6 +24,7 @@ from cpython cimport ( PyUnicode_AsUTF8String, ) + # Cython < 0.17 doesn't have this in cpython cdef extern from "Python.h": cdef PyTypeObject *Py_TYPE(object) @@ -37,7 +38,7 @@ from datetime cimport cmp_pandas_datetimestruct from libc.stdlib cimport free from util cimport (is_integer_object, is_float_object, is_datetime64_object, - is_timedelta64_object) + is_timedelta64_object, INT64_MAX) cimport util from datetime cimport * @@ -904,10 +905,12 @@ cpdef object get_value_box(ndarray arr, object loc): # Add the min and max fields at the class level -# These are defined as magic numbers due to strange -# wraparound behavior when using the true int64 lower boundary -cdef 
int64_t _NS_LOWER_BOUND = -9223285636854775000LL -cdef int64_t _NS_UPPER_BOUND = 9223372036854775807LL +cdef int64_t _NS_UPPER_BOUND = INT64_MAX +# the smallest value we could actually represent is +# INT64_MIN + 1 == -9223372036854775807 +# but to allow overflow free conversion with a microsecond resolution +# use the smallest value with a 0 nanosecond unit (0s in last 3 digits) +cdef int64_t _NS_LOWER_BOUND = -9223372036854775000 cdef pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS pandas_datetime_to_datetimestruct(_NS_LOWER_BOUND, PANDAS_FR_ns, &_NS_MIN_DTS) From 794f79295484298525dbc8dd3b8ab251ad065e61 Mon Sep 17 00:00:00 2001 From: Michael Felt Date: Fri, 21 Oct 2016 00:44:39 +0200 Subject: [PATCH 007/183] Update unpack_template.h (#14441) USE_CASE_RANGE is a GNU C feature. This change will activate USE_CASE_RANGE on any platform when using GNU C and not on any platform when a different compiler is being used. closes #14373 --- pandas/src/msgpack/unpack_template.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/src/msgpack/unpack_template.h b/pandas/src/msgpack/unpack_template.h index 95af6735520fc..fba372ddcb3e4 100644 --- a/pandas/src/msgpack/unpack_template.h +++ b/pandas/src/msgpack/unpack_template.h @@ -17,7 +17,7 @@ */ #ifndef USE_CASE_RANGE -#if !defined(_MSC_VER) +#ifdef __GNUC__ #define USE_CASE_RANGE #endif #endif From 170d13ae3b41cb4793554237fdf3ffc0d60aec1d Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Fri, 21 Oct 2016 19:37:02 -0500 Subject: [PATCH 008/183] DOC: update readme for repo move (#14470) --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6ebc287fa2cf6..4293d7294d5e0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
-
+
----------------- @@ -25,8 +25,8 @@ Build Status - - travis build status + + travis build status @@ -39,7 +39,7 @@ Coverage - coverage + coverage Conda @@ -127,7 +127,7 @@ Here are just a few of the things that pandas does well: ## Where to get it The source code is currently hosted on GitHub at: -http://github.com/pydata/pandas +http://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python package index](http://pypi.python.org/pypi/pandas/) and on conda. From 83a380c95f89542a74b37f3cbd116f0e368c0d12 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 22 Oct 2016 05:49:40 -0400 Subject: [PATCH 009/183] BUG: Catch overflow in both directions for checked add (#14453) 1) Add checks to ensure that add overflow does not occur both in the positive or negative directions. 2) Add benchmarks to ensure that operations involving this checked add function are significantly impacted. --- asv_bench/benchmarks/algorithms.py | 26 +++++++++++++++++ asv_bench/benchmarks/timedelta.py | 13 ++++++++- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/core/nanops.py | 37 ++++++++++++++++++++++--- pandas/tests/test_nanops.py | 9 +++++- pandas/tseries/tests/test_timedeltas.py | 2 ++ 6 files changed, 82 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 6eac7b4831f0f..9807639143ddb 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -15,6 +15,14 @@ def setup(self): self.int = pd.Int64Index(np.arange(N).repeat(5)) self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + # Convenience naming. + self.checked_add = pd.core.nanops._checked_add_with_arr + + self.arr = np.arange(1000000) + self.arrpos = np.arange(1000000) + self.arrneg = np.arange(-1000000, 0) + self.arrmixed = np.array([1, -1]).repeat(500000) + def time_int_factorize(self): self.int.factorize() @@ -29,3 +37,21 @@ def time_int_duplicated(self): def time_float_duplicated(self): self.float.duplicated() + + def time_add_overflow_pos_scalar(self): + self.checked_add(self.arr, 1) + + def time_add_overflow_neg_scalar(self): + self.checked_add(self.arr, -1) + + def time_add_overflow_zero_scalar(self): + self.checked_add(self.arr, 0) + + def time_add_overflow_pos_arr(self): + self.checked_add(self.arr, self.arrpos) + + def time_add_overflow_neg_arr(self): + self.checked_add(self.arr, self.arrneg) + + def time_add_overflow_mixed_arr(self): + self.checked_add(self.arr, self.arrmixed) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 9719fd87dfb2e..8470525dd01fa 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -1,5 +1,5 @@ from .pandas_vb_common import * -from pandas import to_timedelta +from pandas import to_timedelta, Timestamp class timedelta_convert_int(object): @@ -47,3 +47,14 @@ def time_timedelta_convert_coerce(self): def time_timedelta_convert_ignore(self): to_timedelta(self.arr, errors='ignore') + + +class timedelta_add_overflow(object): + goal_time = 0.2 + + def setup(self): + self.td = to_timedelta(np.arange(1000000)) + self.ts = Timestamp('2000') + + def test_add_td_ts(self): + self.td + self.ts diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index b2facd4e2d0ec..4b2aae2277079 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -36,6 +36,7 @@ Bug Fixes - Bug in localizing an ambiguous timezone when a boolean is passed 
(:issue:`14402`) +- Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 564586eec5a8e..d7d68ad536be5 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,6 +11,7 @@ import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib +from pandas.compat.numpy import _np_version_under1p10 from pandas.types.common import (_ensure_int64, _ensure_object, _ensure_float64, _get_dtype, is_float, is_scalar, @@ -829,9 +830,37 @@ def _checked_add_with_arr(arr, b): Raises ------ - OverflowError if any x + y exceeds the maximum int64 value. + OverflowError if any x + y exceeds the maximum or minimum int64 value. """ - if (np.iinfo(np.int64).max - b < arr).any(): - raise OverflowError("Python int too large to " - "convert to C long") + # For performance reasons, we broadcast 'b' to the new array 'b2' + # so that it has the same size as 'arr'. + if _np_version_under1p10: + if lib.isscalar(b): + b2 = np.empty(arr.shape) + b2.fill(b) + else: + b2 = b + else: + b2 = np.broadcast_to(b, arr.shape) + + # gh-14324: For each element in 'arr' and its corresponding element + # in 'b2', we check the sign of the element in 'b2'. If it is positive, + # we then check whether its sum with the element in 'arr' exceeds + # np.iinfo(np.int64).max. If so, we have an overflow error. If it + # it is negative, we then check whether its sum with the element in + # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow + # error as well. + mask1 = b2 > 0 + mask2 = b2 < 0 + + if not mask1.any(): + to_raise = (np.iinfo(np.int64).min - b2 > arr).any() + elif not mask2.any(): + to_raise = (np.iinfo(np.int64).max - b2 < arr).any() + else: + to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any()) + + if to_raise: + raise OverflowError("Overflow in int64 addition") return arr + b diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index f00fdd196abea..be634228b1b6e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1004,13 +1004,20 @@ def prng(self): def test_int64_add_overflow(): # see gh-14068 - msg = "too (big|large) to convert" + msg = "Overflow in int64 addition" m = np.iinfo(np.int64).max + n = np.iinfo(np.int64).min with tm.assertRaisesRegexp(OverflowError, msg): nanops._checked_add_with_arr(np.array([m, m]), m) with tm.assertRaisesRegexp(OverflowError, msg): nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m])) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([n, n]), n) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n])) with tm.assertRaisesRegexp(OverflowError, msg): with tm.assert_produces_warning(RuntimeWarning): nanops._checked_add_with_arr(np.array([m, m]), diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 38e210d698035..f0d14014d6559 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1957,6 +1957,8 @@ def test_add_overflow(self): to_timedelta(106580, 'D') + Timestamp('2000') with tm.assertRaisesRegexp(OverflowError, msg): Timestamp('2000') + 
to_timedelta(106580, 'D') + + msg = "Overflow in int64 addition" with tm.assertRaisesRegexp(OverflowError, msg): to_timedelta([106580], 'D') + Timestamp('2000') with tm.assertRaisesRegexp(OverflowError, msg): From aff20eb2d046f0ac5768912ac7e813c21039b1fe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 22 Oct 2016 11:50:20 +0200 Subject: [PATCH 010/183] DOC: correct DataFrame.pivot docstring (#14430) The mention of panels that are created is not correct. You get a multi-index --- pandas/core/frame.py | 48 +++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1798a35168265..dfe7e90c134fc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3868,9 +3868,8 @@ def last_valid_index(self): def pivot(self, index=None, columns=None, values=None): """ Reshape data (produce a "pivot" table) based on column values. Uses - unique values from index / columns to form axes and return either - DataFrame or Panel, depending on whether you request a single value - column (DataFrame) or all columns (Panel) + unique values from index / columns to form axes of the resulting + DataFrame. Parameters ---------- @@ -3880,7 +3879,20 @@ def pivot(self, index=None, columns=None, values=None): columns : string or object Column name to use to make new frame's columns values : string or object, optional - Column name to use for populating new frame's values + Column name to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns + + Returns + ------- + pivoted : DataFrame + + See also + -------- + DataFrame.pivot_table : generalization of pivot that can handle + duplicate values for one index/column pair + DataFrame.unstack : pivot based on the index values instead of a + column Notes ----- @@ -3889,30 +3901,30 @@ def pivot(self, index=None, columns=None, values=None): Examples -------- + + >>> df = pd.DataFrame({'foo': ['one','one','one','two','two','two'], + 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], + 'baz': [1, 2, 3, 4, 5, 6]}) >>> df foo bar baz - 0 one A 1. - 1 one B 2. - 2 one C 3. - 3 two A 4. - 4 two B 5. - 5 two C 6. 
- - >>> df.pivot('foo', 'bar', 'baz') + 0 one A 1 + 1 one B 2 + 2 one C 3 + 3 two A 4 + 4 two B 5 + 5 two C 6 + + >>> df.pivot(index='foo', columns='bar', values='baz') A B C one 1 2 3 two 4 5 6 - >>> df.pivot('foo', 'bar')['baz'] + >>> df.pivot(index='foo', columns='bar')['baz'] A B C one 1 2 3 two 4 5 6 - Returns - ------- - pivoted : DataFrame - If no values column specified, will have hierarchically indexed - columns + """ from pandas.core.reshape import pivot return pivot(self, index=index, columns=columns, values=values) From 233d51dd099f590379bd9144c40a61f6785b2b57 Mon Sep 17 00:00:00 2001 From: Sinhrks Date: Sat, 22 Oct 2016 18:54:33 +0900 Subject: [PATCH 011/183] BUG: String indexing against object dtype may raise AttributeError (#14424) --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/indexes/base.py | 5 +++++ pandas/tests/indexing/test_indexing.py | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 4b2aae2277079..147ff8795eb00 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -40,6 +40,7 @@ Bug Fixes +- Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 1c24a0db34b2b..10c4d823bd9f3 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2966,6 +2966,11 @@ def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None return Index(joined, name=name) + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + # this is for partial string indexing, + # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex + raise NotImplementedError + def slice_indexer(self, start=None, end=None, step=None, kind=None): """ For an ordered Index, compute the slice indexer for input labels and diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index fa406a27bef69..a50d3d28e5a11 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -3613,6 +3613,27 @@ def test_iloc_non_unique_indexing(self): result = df2.loc[idx] tm.assert_frame_equal(result, expected, check_index_type=False) + def test_string_slice(self): + # GH 14424 + # string indexing against datetimelike with object + # dtype should properly raises KeyError + df = pd.DataFrame([1], pd.Index([pd.Timestamp('2011-01-01')], + dtype=object)) + self.assertTrue(df.index.is_all_dates) + with tm.assertRaises(KeyError): + df['2011'] + + with tm.assertRaises(KeyError): + df.loc['2011', 0] + + df = pd.DataFrame() + self.assertFalse(df.index.is_all_dates) + with tm.assertRaises(KeyError): + df['2011'] + + with tm.assertRaises(KeyError): + df.loc['2011', 0] + def test_mi_access(self): # GH 4145 From 8f54e3573cdcc67c0dd29a19f83a08ba466b2f3b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 24 Oct 2016 18:09:38 -0400 Subject: [PATCH 012/183] MAINT: Use check_output when merging. Since we don't support Python 2.6 anymore, the `check_output` method from `subprocess` is at our disposal. Follow-up to #14447. xref #14439 (comment) Author: gfyoung Closes #14465 from gfyoung/merge-pr-refactor and squashes the following commits: e267d2b [gfyoung] MAINT: Use check_output when merging. 
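Illustrative sketch, not part of the diff: subprocess.check_output has been available since Python 2.7, so the manual Popen/communicate/poll handling removed below is no longer needed. Roughly:

    from subprocess import check_output

    output = check_output(['git', 'rev-parse', 'HEAD'])
    if isinstance(output, bytes):  # bytes on Python 3, str on Python 2
        output = output.decode('utf-8')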
--- scripts/merge-py.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/scripts/merge-py.py b/scripts/merge-py.py index 5d90a006c09c9..b9350f8feceb8 100755 --- a/scripts/merge-py.py +++ b/scripts/merge-py.py @@ -25,12 +25,12 @@ from __future__ import print_function +from subprocess import check_output from requests.auth import HTTPBasicAuth import requests import os import six -import subprocess import sys import textwrap @@ -83,21 +83,10 @@ def fail(msg): def run_cmd(cmd): - # py2.6 does not have subprocess.check_output if isinstance(cmd, six.string_types): cmd = cmd.split(' ') - popenargs = [cmd] - kwargs = {} - - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) + output = check_output(cmd) if isinstance(output, six.binary_type): output = output.decode('utf-8') From 13088842a7218e8e4626ab68f0c4f204f25f0ba4 Mon Sep 17 00:00:00 2001 From: Thiago Serafim Date: Mon, 24 Oct 2016 18:10:51 -0400 Subject: [PATCH 013/183] ERR: Fix GH13139: better error message on invalid pd.eval and df.query input closes #13139 Added test case to check for invalid input(empy string) on pd.eval('') and df.query(''). Used existing helper function(_check_expression) Author: Thiago Serafim Closes #14473 from tserafim/issue#13139 and squashes the following commits: 77483dd [Thiago Serafim] ERR: correctly raise ValueError on empty input to pd.eval() and df.query() (#13139) 9a5c55f [Thiago Serafim] Fix GH13139: better error message on invalid pd.eval and df.query input --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/computation/eval.py | 1 + pandas/computation/tests/test_eval.py | 12 ++++++++++++ pandas/tests/frame/test_query_eval.py | 8 ++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 147ff8795eb00..1940c841c9a37 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -41,7 +41,7 @@ Bug Fixes - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`) - +- Corrrecly raise ``ValueError`` on empty input to ``pd.eval()`` and ``df.query()`` (:issue:`13139`) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 6c5c631a6bf0e..fffde4d9db867 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -233,6 +233,7 @@ def eval(expr, parser='pandas', engine=None, truediv=True, """ first_expr = True if isinstance(expr, string_types): + _check_expression(expr) exprs = [e for e in expr.splitlines() if e != ''] else: exprs = [expr] diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index f480eae2dd04d..ffa2cb0684b72 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1891,6 +1891,18 @@ def test_bad_resolver_raises(): yield check_bad_resolver_raises, engine, parser +def check_empty_string_raises(engine, parser): + # GH 13139 + tm.skip_if_no_ne(engine) + with tm.assertRaisesRegexp(ValueError, 'expr cannot be an empty string'): + pd.eval('', engine=engine, parser=parser) + + +def test_empty_string_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_empty_string_raises, engine, parser + + def check_more_than_one_expression_raises(engine, parser): 
tm.skip_if_no_ne(engine) with tm.assertRaisesRegexp(SyntaxError, diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 85159de64d83e..29662c5addb75 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -147,6 +147,14 @@ def test_query_non_str(self): with tm.assertRaisesRegexp(ValueError, msg): df.query(111) + def test_query_empty_string(self): + # GH 13139 + df = pd.DataFrame({'A': [1, 2, 3]}) + + msg = "expr cannot be an empty string" + with tm.assertRaisesRegexp(ValueError, msg): + df.query('') + def test_eval_resolvers_as_list(self): # GH 14095 df = DataFrame(randn(10, 2), columns=list('ab')) From fe2ebc15d696f02dc3137c0d0c318c7bca6abb7c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 24 Oct 2016 18:17:20 -0400 Subject: [PATCH 014/183] BUG: fix empty intersection of RangeIndex (GH14364) closes #14364 Author: Joris Van den Bossche Closes #14481 from jorisvandenbossche/bug-rangeindex-empty and squashes the following commits: 823e83d [Joris Van den Bossche] BUG: fix empty intersection of RangeIndex (GH14364) --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/indexes/range.py | 7 +++++-- pandas/tests/indexes/test_range.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 1940c841c9a37..d5fa2af5b0ff6 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -43,7 +43,7 @@ Bug Fixes - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`) - Corrrecly raise ``ValueError`` on empty input to ``pd.eval()`` and ``df.query()`` (:issue:`13139`) - +- Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). 
- ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`) diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 76166e7155bd0..7a7902b503bd6 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -315,6 +315,9 @@ def intersection(self, other): if not isinstance(other, RangeIndex): return super(RangeIndex, self).intersection(other) + if not len(self) or not len(other): + return RangeIndex._simple_new(None) + # check whether intervals intersect # deals with in- and decreasing ranges int_low = max(min(self._start, self._stop + 1), @@ -322,7 +325,7 @@ def intersection(self, other): int_high = min(max(self._stop, self._start + 1), max(other._stop, other._start + 1)) if int_high <= int_low: - return RangeIndex() + return RangeIndex._simple_new(None) # Method hint: linear Diophantine equation # solve intersection problem @@ -332,7 +335,7 @@ def intersection(self, other): # check whether element sets intersect if (self._start - other._start) % gcd: - return RangeIndex() + return RangeIndex._simple_new(None) # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index b0b8864521666..26d50aa55431f 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -587,6 +587,35 @@ def test_intersection(self): other.values))) self.assert_index_equal(result, expected) + index = RangeIndex(5) + + # intersect of non-overlapping indices + other = RangeIndex(5, 10, 1) + result = index.intersection(other) + expected = RangeIndex(0, 0, 1) + self.assert_index_equal(result, expected) + + other = RangeIndex(-1, -5, -1) + result = index.intersection(other) + expected = RangeIndex(0, 0, 1) + self.assert_index_equal(result, expected) + + # intersection of empty indices + other = RangeIndex(0, 0, 1) + result = index.intersection(other) + expected = RangeIndex(0, 0, 1) + self.assert_index_equal(result, expected) + + result = other.intersection(index) + self.assert_index_equal(result, expected) + + # intersection of non-overlapping values based on start value and gcd + index = RangeIndex(1, 10, 2) + other = RangeIndex(0, 10, 4) + result = index.intersection(other) + expected = RangeIndex(0, 0, 1) + self.assert_index_equal(result, expected) + def test_intersect_str_dates(self): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] From 192b1cd510948cefff6d8b6c34655ca4828bc95b Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 22 Oct 2016 06:37:05 -0500 Subject: [PATCH 015/183] BLD: don't require cython on sdist install closes #14475 closes #14204 --- doc/source/whatsnew/v0.19.1.txt | 3 +++ setup.py | 34 ++++++++++++++++----------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index d5fa2af5b0ff6..66868e47a5f4f 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -45,6 +45,9 @@ Bug Fixes - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). 
+ +- Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`) + - ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`) diff --git a/setup.py b/setup.py index 846e2b7fa2d88..fdf24585576bc 100755 --- a/setup.py +++ b/setup.py @@ -125,25 +125,25 @@ def is_platform_mac(): class build_ext(_build_ext): def build_extensions(self): - if not cython: - raise ImportError('Building pandas requires cython') - - for pxifile in _pxifiles: - # build pxifiles first, template extention must be .pxi.in - assert pxifile.endswith('.pxi.in') - outfile = pxifile[:-3] - - if (os.path.exists(outfile) and - os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime): - # if .pxi.in is not updated, no need to output .pxi - continue + # if builing from c files, don't need to + # generate template output + if cython: + for pxifile in _pxifiles: + # build pxifiles first, template extention must be .pxi.in + assert pxifile.endswith('.pxi.in') + outfile = pxifile[:-3] + + if (os.path.exists(outfile) and + os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime): + # if .pxi.in is not updated, no need to output .pxi + continue - with open(pxifile, "r") as f: - tmpl = f.read() - pyxcontent = tempita.sub(tmpl) + with open(pxifile, "r") as f: + tmpl = f.read() + pyxcontent = tempita.sub(tmpl) - with open(outfile, "w") as f: - f.write(pyxcontent) + with open(outfile, "w") as f: + f.write(pyxcontent) numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') From 18fba53089fdfa3075cb9faa1f3ac57a2146be9b Mon Sep 17 00:00:00 2001 From: Keshav Ramaswamy Date: Mon, 24 Oct 2016 18:25:29 -0400 Subject: [PATCH 016/183] DOC: updated docstring in .to_datetime() for out-of-bounds timestamps with errors='ignore' closes #14448 Author: Keshav Ramaswamy Author: Keshav Ramaswamy Closes #14452 from keshavramaswamy/master and squashes the following commits: 5468fc5 [Keshav Ramaswamy] added link to timestamp limitations in docstring 3aa78cf [Keshav Ramaswamy] edit docstring to fit python standards 8bfa58e [Keshav Ramaswamy] edited docstring to fit python standards 5ed8ef5 [Keshav Ramaswamy] edut docstring to fit Python Standards 7402de4 [Keshav Ramaswamy] edited docstring to fit python standards c16ad6b [Keshav Ramaswamy] added timestamp limitations and default behavior to docstring 0db07b5 [Keshav Ramaswamy] updated docstring in .to_datetime() --- pandas/tseries/tools.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 637e70b76de98..326bc5be3fd8f 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -42,6 +42,7 @@ def _infer(a, b): raise AssertionError('Inputs must both have the same timezone,' ' {0} != {1}'.format(tz, b.tzinfo)) return tz + tz = None if start is not None: tz = _infer(start, end) @@ -264,10 +265,15 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, 1 2016-03-05 dtype: datetime64[ns] - If a date that does not meet timestamp limitations, passing errors='coerce' - will force to NaT. Furthermore this will force non-dates to NaT as well. + If a date does not meet the `timestamp limitations + `_, passing errors='ignore' + will return the original input instead of raising any exception. + + Passing errors='coerce' will force an out-of-bounds date to NaT, + in addition to forcing non-dates (or non-parseable dates) to NaT. 
- >>> pd.to_datetime('13000101', format='%Y%m%d') + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT @@ -420,6 +426,7 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): return _convert_listlike(np.array([arg]), box, format)[0] + # mappings for assembling units _unit_map = {'year': 'year', 'years': 'year', @@ -552,7 +559,7 @@ def calc_with_mask(carg, mask): result = np.empty(carg.shape, dtype='M8[ns]') iresult = result.view('i8') iresult[~mask] = tslib.iNaT - result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)).\ + result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)). \ astype('M8[ns]') return result @@ -637,7 +644,6 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): DateParseError = tslib.DateParseError normalize_date = tslib.normalize_date - # Fixed time formats for time parsing _time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"] @@ -763,6 +769,7 @@ def format(dt): """Returns date in YYYYMMDD format.""" return dt.strftime('%Y%m%d') + OLE_TIME_ZERO = datetime(1899, 12, 30, 0, 0, 0) From bee90a7c50576b0160db55fb325908040233e92d Mon Sep 17 00:00:00 2001 From: David Krych Date: Mon, 24 Oct 2016 18:31:02 -0400 Subject: [PATCH 017/183] BUG: GH14323 Union of differences from DatetimeIndex incorrect closes #14323 Sets freq to None when doing a difference operation on a DatetimeIndex or TimedeltaIndex, rather than retaining the frequency (which can cause problems with downstream operations). Frequency of PeriodIndex is retained. Author: David Krych Closes #14346 from Liam3851/dtind_diff_14323 and squashes the following commits: 1dbf582 [David Krych] BUG: GH14323 Union of differences from DatetimeIndex incorrect --- doc/source/whatsnew/v0.19.1.txt | 2 + pandas/indexes/base.py | 2 +- pandas/tests/indexes/test_datetimelike.py | 74 +++++++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 66868e47a5f4f..09eb0f389dcf4 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -44,6 +44,8 @@ Bug Fixes - Corrrecly raise ``ValueError`` on empty input to ``pd.eval()`` and ``df.query()`` (:issue:`13139`) - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). 
+- Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) + - Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 10c4d823bd9f3..4d2dcd259e623 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2003,7 +2003,7 @@ def difference(self, other): except TypeError: pass - return this._shallow_copy(the_diff, name=result_name) + return this._shallow_copy(the_diff, name=result_name, freq=None) def symmetric_difference(self, other, result_name=None): """ diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 7502a4ce26b04..b04e840ffc849 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -732,6 +732,31 @@ def test_fillna_datetime64(self): dtype=object) self.assert_index_equal(idx.fillna('x'), exp) + def test_difference_of_union(self): + # GH14323: Test taking the union of differences of an Index. + # Difference of DatetimeIndex does not preserve frequency, + # so a differencing operation should not retain the freq field of the + # original index. + i = pd.date_range("20160920", "20160925", freq="D") + + a = pd.date_range("20160921", "20160924", freq="D") + expected = pd.DatetimeIndex(["20160920", "20160925"], freq=None) + a_diff = i.difference(a) + tm.assert_index_equal(a_diff, expected) + tm.assert_attr_equal('freq', a_diff, expected) + + b = pd.date_range("20160922", "20160925", freq="D") + b_diff = i.difference(b) + expected = pd.DatetimeIndex(["20160920", "20160921"], freq=None) + tm.assert_index_equal(b_diff, expected) + tm.assert_attr_equal('freq', b_diff, expected) + + union_of_diff = a_diff.union(b_diff) + expected = pd.DatetimeIndex(["20160920", "20160921", "20160925"], + freq=None) + tm.assert_index_equal(union_of_diff, expected) + tm.assert_attr_equal('freq', union_of_diff, expected) + class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex @@ -938,6 +963,30 @@ def test_no_millisecond_field(self): with self.assertRaises(AttributeError): DatetimeIndex([]).millisecond + def test_difference_of_union(self): + # GH14323: Test taking the union of differences of an Index. + # Difference of Period MUST preserve frequency, but the ability + # to union results must be preserved + i = pd.period_range("20160920", "20160925", freq="D") + + a = pd.period_range("20160921", "20160924", freq="D") + expected = pd.PeriodIndex(["20160920", "20160925"], freq='D') + a_diff = i.difference(a) + tm.assert_index_equal(a_diff, expected) + tm.assert_attr_equal('freq', a_diff, expected) + + b = pd.period_range("20160922", "20160925", freq="D") + b_diff = i.difference(b) + expected = pd.PeriodIndex(["20160920", "20160921"], freq='D') + tm.assert_index_equal(b_diff, expected) + tm.assert_attr_equal('freq', b_diff, expected) + + union_of_diff = a_diff.union(b_diff) + expected = pd.PeriodIndex(["20160920", "20160921", "20160925"], + freq='D') + tm.assert_index_equal(union_of_diff, expected) + tm.assert_attr_equal('freq', union_of_diff, expected) + class TestTimedeltaIndex(DatetimeLike, tm.TestCase): _holder = TimedeltaIndex @@ -1149,3 +1198,28 @@ def test_fillna_timedelta(self): exp = pd.Index( [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) + + def test_difference_of_union(self): + # GH14323: Test taking the union of differences of an Index. 
+ # Difference of TimedeltaIndex does not preserve frequency, + # so a differencing operation should not retain the freq field of the + # original index. + i = pd.timedelta_range("0 days", "5 days", freq="D") + + a = pd.timedelta_range("1 days", "4 days", freq="D") + expected = pd.TimedeltaIndex(["0 days", "5 days"], freq=None) + a_diff = i.difference(a) + tm.assert_index_equal(a_diff, expected) + tm.assert_attr_equal('freq', a_diff, expected) + + b = pd.timedelta_range("2 days", "5 days", freq="D") + b_diff = i.difference(b) + expected = pd.TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(b_diff, expected) + tm.assert_attr_equal('freq', b_diff, expected) + + union_of_difference = a_diff.union(b_diff) + expected = pd.TimedeltaIndex(["0 days", "1 days", "5 days"], + freq=None) + tm.assert_index_equal(union_of_difference, expected) + tm.assert_attr_equal('freq', union_of_difference, expected) From 5cf6d9406cd3ab43427a950719e212194f3de7c7 Mon Sep 17 00:00:00 2001 From: Joe Jevnik Date: Mon, 24 Oct 2016 18:38:12 -0400 Subject: [PATCH 018/183] BUG: block mutation of read-only array in series Author: Joe Jevnik Closes #14359 from llllllllll/series-setitem and squashes the following commits: 9925327 [Joe Jevnik] BUG: fix a bug in Series.__setitem__ that allowed the mutatation of read-only arrays --- doc/source/whatsnew/v0.19.1.txt | 4 ++-- pandas/lib.pyx | 12 +++++++--- pandas/src/util.pxd | 15 +++++++++++- pandas/tests/series/test_indexing.py | 34 ++++++++++++++++++++++++++++ setup.py | 3 ++- 5 files changed, 61 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 09eb0f389dcf4..e5310a272c351 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -46,6 +46,7 @@ Bug Fixes - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). - Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) +- Bug in ``Series.__setitem__`` which allowed mutating read-only arrays (:issue:`14359`). 
- Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`) @@ -62,5 +63,4 @@ Bug Fixes - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) -- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` - is not scalar and ``values`` is not specified (:issue:`14380`) +- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index ef3407ffd5388..b09a1c2755a06 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -975,7 +975,9 @@ def astype_intsafe(ndarray[object] arr, new_dtype): if is_datelike and checknull(v): result[i] = NPY_NAT else: - util.set_value_at(result, i, v) + # we can use the unsafe version because we know `result` is mutable + # since it was created from `np.empty` + util.set_value_at_unsafe(result, i, v) return result @@ -986,7 +988,9 @@ cpdef ndarray[object] astype_unicode(ndarray arr): ndarray[object] result = np.empty(n, dtype=object) for i in range(n): - util.set_value_at(result, i, unicode(arr[i])) + # we can use the unsafe version because we know `result` is mutable + # since it was created from `np.empty` + util.set_value_at_unsafe(result, i, unicode(arr[i])) return result @@ -997,7 +1001,9 @@ cpdef ndarray[object] astype_str(ndarray arr): ndarray[object] result = np.empty(n, dtype=object) for i in range(n): - util.set_value_at(result, i, str(arr[i])) + # we can use the unsafe version because we know `result` is mutable + # since it was created from `np.empty` + util.set_value_at_unsafe(result, i, str(arr[i])) return result diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index fdbfbf62af7d2..be8d0d4aa6302 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -70,7 +70,12 @@ cdef inline object get_value_at(ndarray arr, object loc): return get_value_1d(arr, i) -cdef inline set_value_at(ndarray arr, object loc, object value): +cdef inline set_value_at_unsafe(ndarray arr, object loc, object value): + """Sets a value into the array without checking the writeable flag. + + This should be used when setting values in a loop, check the writeable + flag above the loop and then eschew the check on each iteration. + """ cdef: Py_ssize_t i, sz if is_float_object(loc): @@ -87,6 +92,14 @@ cdef inline set_value_at(ndarray arr, object loc, object value): assign_value_1d(arr, i, value) +cdef inline set_value_at(ndarray arr, object loc, object value): + """Sets a value into the array after checking that the array is mutable. 
+ """ + if not cnp.PyArray_ISWRITEABLE(arr): + raise ValueError('assignment destination is read-only') + + set_value_at_unsafe(arr, loc, value) + cdef inline int is_contiguous(ndarray arr): return cnp.PyArray_CHKFLAGS(arr, cnp.NPY_C_CONTIGUOUS) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 7c16fd060b181..c44a7a898bb8d 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1947,6 +1947,40 @@ def test_multilevel_preserve_name(self): self.assertEqual(result.name, s.name) self.assertEqual(result2.name, s.name) + def test_setitem_scalar_into_readonly_backing_data(self): + # GH14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array) + + for n in range(len(series)): + with self.assertRaises(ValueError): + series[n] = 1 + + self.assertEqual( + array[n], + 0, + msg='even though the ValueError was raised, the underlying' + ' array was still mutated!', + ) + + def test_setitem_slice_into_readonly_backing_data(self): + # GH14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array) + + with self.assertRaises(ValueError): + series[1:3] = 1 + + self.assertTrue( + not array.any(), + msg='even though the ValueError was raised, the underlying' + ' array was still mutated!', + ) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/setup.py b/setup.py index fdf24585576bc..3f8667cd6fe42 100755 --- a/setup.py +++ b/setup.py @@ -476,7 +476,8 @@ def pxd(name): 'pandas/src/period_helper.c']}, index={'pyxfile': 'index', 'sources': ['pandas/src/datetime/np_datetime.c', - 'pandas/src/datetime/np_datetime_strings.c']}, + 'pandas/src/datetime/np_datetime_strings.c'], + 'pxdfiles': ['src/util']}, algos={'pyxfile': 'algos', 'pxdfiles': ['src/util'], 'depends': _pxi_dep['algos']}, From 2e77536bdf90ef20fefd4eab751447918e07668f Mon Sep 17 00:00:00 2001 From: paul-mannino Date: Sun, 9 Oct 2016 17:45:34 -0500 Subject: [PATCH 019/183] BUG: Fix issue with inserting duplicate columns in a dataframe closes #14291 closes #14431 --- doc/source/whatsnew/v0.19.1.txt | 8 ++++++ pandas/core/frame.py | 30 +++++++++++++++----- pandas/sparse/frame.py | 16 ++++++++++- pandas/tests/frame/test_nonunique_indexes.py | 10 +++++++ 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index e5310a272c351..8bebe5e782e3c 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -50,6 +50,7 @@ Bug Fixes - Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`) +- Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`) - ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`) @@ -63,4 +64,11 @@ Bug Fixes - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) + + + + + + + - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) diff 
--git a/pandas/core/frame.py b/pandas/core/frame.py index dfe7e90c134fc..05148c1f7e80a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2487,7 +2487,7 @@ def _set_item(self, key, value): # check if we are modifying a copy # try to set first as we want an invalid - # value exeption to occur first + # value exception to occur first if len(self): self._check_setitem_copy() @@ -2503,10 +2503,10 @@ def insert(self, loc, column, value, allow_duplicates=False): loc : int Must have 0 <= loc <= len(columns) column : object - value : int, Series, or array-like + value : scalar, Series, or array-like """ self._ensure_valid_index(value) - value = self._sanitize_column(column, value) + value = self._sanitize_column(column, value, broadcast=False) self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) @@ -2590,9 +2590,25 @@ def assign(self, **kwargs): return data - def _sanitize_column(self, key, value): - # Need to make sure new columns (which go into the BlockManager as new - # blocks) are always copied + def _sanitize_column(self, key, value, broadcast=True): + """ + Ensures new columns (which go into the BlockManager as new blocks) are + always copied and converted into an array. + + Parameters + ---------- + key : object + value : scalar, Series, or array-like + broadcast : bool, default True + If ``key`` matches multiple duplicate column names in the + DataFrame, this parameter indicates whether ``value`` should be + tiled so that the returned array contains a (duplicated) column for + each occurrence of the key. If False, ``value`` will not be tiled. + + Returns + ------- + sanitized_column : numpy-array + """ def reindexer(value): # reindex if necessary @@ -2665,7 +2681,7 @@ def reindexer(value): return value # broadcast across multiple columns if necessary - if key in self.columns and value.ndim == 1: + if broadcast and key in self.columns and value.ndim == 1: if (not self.columns.is_unique or isinstance(self.columns, MultiIndex)): existing_piece = self[key] diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 8eeff045d1fac..56020e32b9963 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -302,7 +302,21 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, # ---------------------------------------------------------------------- # Support different internal representation of SparseDataFrame - def _sanitize_column(self, key, value): + def _sanitize_column(self, key, value, **kwargs): + """ + Creates a new SparseArray from the input value. 
+ + Parameters + ---------- + key : object + value : scalar, Series, or array-like + kwargs : dict + + Returns + ------- + sanitized_column : SparseArray + + """ sp_maker = lambda x, index=None: SparseArray( x, index=index, fill_value=self._default_fill_value, kind=self._default_kind) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 77974718714f8..220d29f624942 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -468,3 +468,13 @@ def test_set_value_by_index(self): df.iloc[:, 0] = 3 assert_series_equal(df.iloc[:, 1], expected) + + def test_insert_with_columns_dups(self): + # GH 14291 + df = pd.DataFrame() + df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True) + df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True) + df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True) + exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'], + ['c', 'f', 'i']], columns=['A', 'A', 'A']) + assert_frame_equal(df, exp) From 6ff53c2b47b026f605e415d3cd5f3b0dda7e0774 Mon Sep 17 00:00:00 2001 From: Nicholas Ver Halen Date: Tue, 25 Oct 2016 06:41:37 -0400 Subject: [PATCH 020/183] BUG: downcast='unsigned' on 0 would would not downcast to unsigned. closes #14401 Author: Nicholas Ver Halen Closes #14472 from verhalenn/issue14401 and squashes the following commits: 21e1a97 [Nicholas Ver Halen] Made the downcast limits test its own function. a62a3f8 [Nicholas Ver Halen] Revert "Added release note to issue 14401 resolve." 7413a9c [Nicholas Ver Halen] Did some editing for some lint problems. 26fe3a1 [Nicholas Ver Halen] Made the dictionaries into one list of sets. 5023fc7 [Nicholas Ver Halen] Changed the test to work with python 3.x 9ccc991 [Nicholas Ver Halen] Changed the tests so that it iterated through a dictionary. ef35e19 [Nicholas Ver Halen] Added tests for the max and min values of all dtypes to to_numeric cc278ff [Nicholas Ver Halen] Added release note to issue 14401 resolve. 11a421d [Nicholas Ver Halen] Added a test to check uint8 with 0 11038f8 [Nicholas Ver Halen] Made it so that 0 was included in uint8 --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/tools/tests/test_util.py | 46 +++++++++++++++++++++++++++++++++ pandas/tools/util.py | 2 +- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 8bebe5e782e3c..d1bb0ed4a69a1 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -57,6 +57,7 @@ Bug Fixes - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) +- Bug in ``pd.to_numeric`` where 0 was not included when ``downcast='unsigned'`` is passed (:issue:`14401`) - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 8c16308d79a31..ddd408202bcfc 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -401,6 +401,52 @@ def test_downcast(self): res = pd.to_numeric(data, downcast=downcast) tm.assert_numpy_array_equal(res, expected) + def test_downcast_limits(self): + # Test the limits of each downcast. 
#14401 + # uint64 is not fully supported ATM + dtype_downcast_min_max = [ + ('int8', 'integer', + [np.iinfo(np.int8).min, np.iinfo(np.int8).max]), + ('int16', 'integer', + [np.iinfo(np.int16).min, np.iinfo(np.int16).max]), + ('int32', 'integer', + [np.iinfo(np.int32).min, np.iinfo(np.int32).max]), + ('int64', 'integer', + [np.iinfo(np.int64).min, np.iinfo(np.int64).max]), + ('uint8', 'unsigned', + [np.iinfo(np.uint8).min, np.iinfo(np.uint8).max]), + ('uint16', 'unsigned', + [np.iinfo(np.uint16).min, np.iinfo(np.uint16).max]), + ('uint32', 'unsigned', + [np.iinfo(np.uint32).min, np.iinfo(np.uint32).max]), + # ('uint64', 'unsigned', + # [np.iinfo(np.uint64).min, np.iinfo(np.uint64).max]), + + ('int16', 'integer', + [np.iinfo(np.int8).min, np.iinfo(np.int8).max + 1]), + ('int32', 'integer', + [np.iinfo(np.int16).min, np.iinfo(np.int16).max + 1]), + ('int64', 'integer', + [np.iinfo(np.int32).min, np.iinfo(np.int32).max + 1]), + ('int16', 'integer', + [np.iinfo(np.int8).min - 1, np.iinfo(np.int16).max]), + ('int32', 'integer', + [np.iinfo(np.int16).min - 1, np.iinfo(np.int32).max]), + ('int64', 'integer', + [np.iinfo(np.int32).min - 1, np.iinfo(np.int64).max]), + ('uint16', 'unsigned', + [np.iinfo(np.uint8).min, np.iinfo(np.uint8).max + 1]), + ('uint32', 'unsigned', + [np.iinfo(np.uint16).min, np.iinfo(np.uint16).max + 1]), + # ('uint64', 'unsigned', + # [np.iinfo(np.uint32).min, np.iinfo(np.uint32).max + 1]), + ] + + for dtype, downcast, min_max in dtype_downcast_min_max: + series = pd.to_numeric(pd.Series(min_max), downcast=downcast) + tm.assert_equal(series.dtype, dtype) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index fec56328c1721..b50bf9dc448bc 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -205,7 +205,7 @@ def to_numeric(arg, errors='raise', downcast=None): if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] - elif downcast == 'unsigned' and np.min(values) > 0: + elif downcast == 'unsigned' and np.min(values) >= 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] From 48520083ae27eefb9a918b430523151df9166704 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Tue, 25 Oct 2016 06:49:24 -0400 Subject: [PATCH 021/183] Bug: Error when key-only Grouper is passed to groupby in a list (GH14334) closes #14334 Author: Jon M. Mease Closes #14342 from jmmease/bug_14334 and squashes the following commits: 5e96797 [Jon M. Mease] Add tests for grouping on two columns cee5ce6 [Jon M. Mease] Added bug description to new test case f9ef05b [Jon M. Mease] Moved whatsnew to 0.19.1 and clarified description 14a4ae6 [Jon M. Mease] Added whatsnew for GH 14334 9805c30 [Jon M. Mease] Fix for GH 14334 dfd3e09 [Jon M. 
Mease] Added test case for GH 14334 --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/core/groupby.py | 10 ++++++++-- pandas/tests/test_groupby.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index d1bb0ed4a69a1..d79332f193bc4 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -64,6 +64,7 @@ Bug Fixes - Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`) - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) +- Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5223c0ac270f3..5e08f6c3368a6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2208,7 +2208,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, index._get_grouper_for_level(self.grouper, level) else: - if isinstance(self.grouper, (list, tuple)): + if self.grouper is None and self.name is not None: + self.grouper = self.obj[self.name] + + elif isinstance(self.grouper, (list, tuple)): self.grouper = com._asarray_tuplesafe(self.grouper) # a passed Categorical @@ -2448,7 +2451,10 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) - + elif isinstance(gpr, Grouper) and gpr.key is not None: + # Add key to exclusions + exclusions.append(gpr.key) + in_axis, name = False, None else: in_axis, name = False, None diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index f3791ee1d5c91..89aaafe9b2c02 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -442,6 +442,36 @@ def test_grouper_creation_bug(self): result = g.sum() assert_frame_equal(result, expected) + # GH14334 + # pd.Grouper(key=...) 
may be passed in a list + df = DataFrame({'A': [0, 0, 0, 1, 1, 1], + 'B': [1, 1, 2, 2, 3, 3], + 'C': [1, 2, 3, 4, 5, 6]}) + # Group by single column + expected = df.groupby('A').sum() + g = df.groupby([pd.Grouper(key='A')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group by two columns + # using a combination of strings and Grouper objects + expected = df.groupby(['A', 'B']).sum() + + # Group with two Grouper objects + g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group with a string and a Grouper object + g = df.groupby(['A', pd.Grouper(key='B')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group with a Grouper object and a string + g = df.groupby([pd.Grouper(key='A'), 'B']) + result = g.sum() + assert_frame_equal(result, expected) + # GH8866 s = Series(np.arange(8, dtype='int64'), index=pd.MultiIndex.from_product( From f99f050aaf29e4b7e9190488904c12bb719f8210 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 25 Oct 2016 06:52:36 -0400 Subject: [PATCH 022/183] BUG: incorrect broadcasting that could casuse dtype coercion in a groupby-transform closes #14457 Author: Jeff Reback Closes #14466 from jreback/transform and squashes the following commits: ce595b9 [Jeff Reback] BUG: incorrect broadcasting that could casuse dtype coercion in a groupby-transform --- doc/source/whatsnew/v0.19.1.txt | 4 ++++ pandas/core/groupby.py | 21 +++++++++++++++------ pandas/tests/test_groupby.py | 12 ++++++++++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index d79332f193bc4..3256e017b4df4 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -43,9 +43,13 @@ Bug Fixes - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`) - Corrrecly raise ``ValueError`` on empty input to ``pd.eval()`` and ``df.query()`` (:issue:`13139`) + - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). - Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) +- Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`) + + - Bug in ``Series.__setitem__`` which allowed mutating read-only arrays (:issue:`14359`). 
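The user-visible symptom addressed here, mirroring the ``test_transform_coercion`` case added below (a minimal sketch, assuming a build with this patch applied): the cythonized path and the general broadcasting path of ``groupby(...).transform`` should agree on both values and dtype::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': ['a', 'a'], 'B': [0, 1]})
    g = df.groupby('A')

    fast = g.transform(np.mean)               # cythonized path
    slow = g.transform(lambda x: np.mean(x))  # general path that broadcasts per group

    print(fast.dtypes)        # float64, not coerced back to int
    print(slow.dtypes)        # should match the fast path with the fix
    print(fast.equals(slow))  # True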
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5e08f6c3368a6..2a7f896e1b871 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3460,7 +3460,6 @@ def _transform_general(self, func, *args, **kwargs): from pandas.tools.merge import concat applied = [] - obj = self._obj_with_exclusions gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) @@ -3481,14 +3480,24 @@ def _transform_general(self, func, *args, **kwargs): else: res = path(group) - # broadcasting if isinstance(res, Series): - if res.index.is_(obj.index): - group.T.values[:] = res + + # we need to broadcast across the + # other dimension; this will preserve dtypes + # GH14457 + if not np.prod(group.shape): + continue + elif res.index.is_(obj.index): + r = concat([res] * len(group.columns), axis=1) + r.columns = group.columns + r.index = group.index else: - group.values[:] = res + r = DataFrame( + np.concatenate([res.values] * len(group.index) + ).reshape(group.shape), + columns=group.columns, index=group.index) - applied.append(group) + applied.append(r) else: applied.append(res) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 89aaafe9b2c02..dc326aeaa88ac 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1366,6 +1366,18 @@ def nsum(x): for result in results: assert_series_equal(result, expected, check_names=False) + def test_transform_coercion(self): + + # 14457 + # when we are transforming be sure to not coerce + # via assignment + df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) + g = df.groupby('A') + + expected = g.transform(np.mean) + result = g.transform(lambda x: np.mean(x)) + assert_frame_equal(result, expected) + def test_with_na(self): index = Index(np.arange(10)) From d1d75d7fcb6b5d090cf96d328d0136c3959a82b8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 25 Oct 2016 18:18:53 -0400 Subject: [PATCH 023/183] Revert "BUG: downcast='unsigned' on 0 would would not downcast to unsigned." This reverts commit 6ff53c2b47b026f605e415d3cd5f3b0dda7e0774. --- doc/source/whatsnew/v0.19.1.txt | 1 - pandas/tools/tests/test_util.py | 46 --------------------------------- pandas/tools/util.py | 2 +- 3 files changed, 1 insertion(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 3256e017b4df4..8f5f78a5e93f7 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -61,7 +61,6 @@ Bug Fixes - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) -- Bug in ``pd.to_numeric`` where 0 was not included when ``downcast='unsigned'`` is passed (:issue:`14401`) - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index ddd408202bcfc..8c16308d79a31 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -401,52 +401,6 @@ def test_downcast(self): res = pd.to_numeric(data, downcast=downcast) tm.assert_numpy_array_equal(res, expected) - def test_downcast_limits(self): - # Test the limits of each downcast. 
#14401 - # uint64 is not fully supported ATM - dtype_downcast_min_max = [ - ('int8', 'integer', - [np.iinfo(np.int8).min, np.iinfo(np.int8).max]), - ('int16', 'integer', - [np.iinfo(np.int16).min, np.iinfo(np.int16).max]), - ('int32', 'integer', - [np.iinfo(np.int32).min, np.iinfo(np.int32).max]), - ('int64', 'integer', - [np.iinfo(np.int64).min, np.iinfo(np.int64).max]), - ('uint8', 'unsigned', - [np.iinfo(np.uint8).min, np.iinfo(np.uint8).max]), - ('uint16', 'unsigned', - [np.iinfo(np.uint16).min, np.iinfo(np.uint16).max]), - ('uint32', 'unsigned', - [np.iinfo(np.uint32).min, np.iinfo(np.uint32).max]), - # ('uint64', 'unsigned', - # [np.iinfo(np.uint64).min, np.iinfo(np.uint64).max]), - - ('int16', 'integer', - [np.iinfo(np.int8).min, np.iinfo(np.int8).max + 1]), - ('int32', 'integer', - [np.iinfo(np.int16).min, np.iinfo(np.int16).max + 1]), - ('int64', 'integer', - [np.iinfo(np.int32).min, np.iinfo(np.int32).max + 1]), - ('int16', 'integer', - [np.iinfo(np.int8).min - 1, np.iinfo(np.int16).max]), - ('int32', 'integer', - [np.iinfo(np.int16).min - 1, np.iinfo(np.int32).max]), - ('int64', 'integer', - [np.iinfo(np.int32).min - 1, np.iinfo(np.int64).max]), - ('uint16', 'unsigned', - [np.iinfo(np.uint8).min, np.iinfo(np.uint8).max + 1]), - ('uint32', 'unsigned', - [np.iinfo(np.uint16).min, np.iinfo(np.uint16).max + 1]), - # ('uint64', 'unsigned', - # [np.iinfo(np.uint32).min, np.iinfo(np.uint32).max + 1]), - ] - - for dtype, downcast, min_max in dtype_downcast_min_max: - series = pd.to_numeric(pd.Series(min_max), downcast=downcast) - tm.assert_equal(series.dtype, dtype) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index b50bf9dc448bc..fec56328c1721 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -205,7 +205,7 @@ def to_numeric(arg, errors='raise', downcast=None): if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] - elif downcast == 'unsigned' and np.min(values) >= 0: + elif downcast == 'unsigned' and np.min(values) > 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] From e3d943d1876e8914036e9c323f1876e52da34a04 Mon Sep 17 00:00:00 2001 From: Larry Ren Date: Wed, 26 Oct 2016 09:16:57 +0100 Subject: [PATCH 024/183] PERF: performance regression in Series.asof (#14476) * Fix performance regression in Series.asof by avoiding pre-computing nulls and returning value by indexing the underlying ndarray. 
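What the new benchmarks below exercise, at a smaller scale (a minimal sketch of ``Series.asof`` usage, assuming this patch is applied): a scalar ``where`` now walks back from the insertion point instead of pre-computing a null mask for the whole Series, while a list-like ``where`` keeps the mask-based path::

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2016-01-01', periods=6, freq='D')
    ts = pd.Series([1.0, 2.0, np.nan, np.nan, 5.0, 6.0], index=rng)

    # scalar lookup: last valid observation at or before the timestamp
    print(ts.asof(pd.Timestamp('2016-01-04 12:00')))   # -> 2.0

    # list-like lookup returns a Series aligned to the requested times
    print(ts.asof(rng + pd.Timedelta('12h')))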
--- asv_bench/benchmarks/timeseries.py | 81 +++++++++++++++++++----------- doc/source/whatsnew/v0.19.1.txt | 5 +- pandas/core/generic.py | 34 ++++++++----- 3 files changed, 76 insertions(+), 44 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index fda6ebb4b437e..8c00924cb07ef 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -284,56 +284,77 @@ class timeseries_asof(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) self.N = 10000 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.ts2 = self.ts.copy() + self.ts2[250:5000] = np.nan + self.ts3 = self.ts.copy() + self.ts3[-5000:] = np.nan - def time_timeseries_asof(self): + # test speed of pre-computing NAs. + def time_asof_list(self): self.ts.asof(self.dates) + # should be roughly the same as above. + def time_asof_nan_list(self): + self.ts2.asof(self.dates) -class timeseries_asof_nan(object): - goal_time = 0.2 + # test speed of the code path for a scalar index + # without *while* loop + def time_asof_single(self): + self.ts.asof(self.dates[0]) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') - self.ts[250:5000] = np.nan + # test speed of the code path for a scalar index + # before the start. should be the same as above. + def time_asof_single_early(self): + self.ts.asof(self.dates[0] - dt.timedelta(10)) - def time_timeseries_asof_nan(self): - self.ts.asof(self.dates) + # test the speed of the code path for a scalar index + # with a long *while* loop. should still be much + # faster than pre-computing all the NAs. + def time_asof_nan_single(self): + self.ts3.asof(self.dates[-1]) -class timeseries_asof_single(object): +class timeseries_dataframe_asof(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) self.N = 10000 + self.M = 100 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng) + self.ts2 = self.ts.copy() + self.ts2.iloc[250:5000] = np.nan + self.ts3 = self.ts.copy() + self.ts3.iloc[-5000:] = np.nan + + # test speed of pre-computing NAs. + def time_asof_list(self): + self.ts.asof(self.dates) - def time_timeseries_asof_single(self): + # should be roughly the same as above. 
+ def time_asof_nan_list(self): + self.ts2.asof(self.dates) + + # test speed of the code path for a scalar index + # with pre-computing all NAs. + def time_asof_single(self): self.ts.asof(self.dates[0]) + # should be roughly the same as above. + def time_asof_nan_single(self): + self.ts3.asof(self.dates[-1]) + + # test speed of the code path for a scalar index + # before the start. should be without the cost of + # pre-computing all the NAs. + def time_asof_single_early(self): + self.ts.asof(self.dates[0] - dt.timedelta(10)) + class timeseries_custom_bday_apply(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 8f5f78a5e93f7..3ee4cc1dde92d 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -21,8 +21,9 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`) -- Improved Performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) - +- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) +- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461) +- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 697438df87d4f..037ab900e6150 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3735,10 +3735,10 @@ def asof(self, where, subset=None): if not self.index.is_monotonic: raise ValueError("asof requires a sorted index") - if isinstance(self, ABCSeries): + is_series = isinstance(self, ABCSeries) + if is_series: if subset is not None: raise ValueError("subset is not valid for Series") - nulls = self.isnull() elif self.ndim > 2: raise NotImplementedError("asof is not implemented " "for {type}".format(type(self))) @@ -3747,9 +3747,9 @@ def asof(self, where, subset=None): subset = self.columns if not is_list_like(subset): subset = [subset] - nulls = self[subset].isnull().any(1) - if not is_list_like(where): + is_list = is_list_like(where) + if not is_list: start = self.index[0] if isinstance(self.index, PeriodIndex): where = Period(where, freq=self.index.freq).ordinal @@ -3758,16 +3758,26 @@ def asof(self, where, subset=None): if where < start: return np.nan - loc = self.index.searchsorted(where, side='right') - if loc > 0: - loc -= 1 - while nulls[loc] and loc > 0: - loc -= 1 - return self.iloc[loc] + # It's always much faster to use a *while* loop here for + # Series than pre-computing all the NAs. However a + # *while* loop is extremely expensive for DataFrame + # so we later pre-compute all the NAs and use the same + # code path whether *where* is a scalar or list. 
+ # See PR: https://github.com/pandas-dev/pandas/pull/14476 + if is_series: + loc = self.index.searchsorted(where, side='right') + if loc > 0: + loc -= 1 + + values = self._values + while loc > 0 and isnull(values[loc]): + loc -= 1 + return values[loc] if not isinstance(where, Index): - where = Index(where) + where = Index(where) if is_list else Index([where]) + nulls = self.isnull() if is_series else self[subset].isnull().any(1) locs = self.index.asof_locs(where, ~(nulls.values)) # mask the missing @@ -3775,7 +3785,7 @@ def asof(self, where, subset=None): data = self.take(locs, is_copy=False) data.index = where data.loc[missing] = np.nan - return data + return data if is_list else data.iloc[-1] # ---------------------------------------------------------------------- # Action Methods From 050bf60edd9e551eb6927f2c167b974d1f8eade5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Oct 2016 18:18:34 -0400 Subject: [PATCH 025/183] COMPAT/TST: fix test for range testing of negative integers to neg powers xref https://github.com/numpy/numpy/pull/8127 closes #14489 Author: Jeff Reback Closes #14498 from jreback/compat and squashes the following commits: 882872e [Jeff Reback] COMPAT/TST: fix test for range testing of negative integers to neg powers --- pandas/tests/indexes/test_range.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 26d50aa55431f..38e715fce2720 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -29,12 +29,7 @@ def setUp(self): def create_index(self): return RangeIndex(5) - def test_binops(self): - ops = [operator.add, operator.sub, operator.mul, operator.floordiv, - operator.truediv, pow] - scalars = [-1, 1, 2] - idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2), - RangeIndex(-10, 10, 2), RangeIndex(5, -5, -1)] + def check_binop(self, ops, scalars, idxs): for op in ops: for a, b in combinations(idxs, 2): result = op(a, b) @@ -46,6 +41,23 @@ def test_binops(self): expected = op(Int64Index(idx), scalar) tm.assert_index_equal(result, expected) + def test_binops(self): + ops = [operator.add, operator.sub, operator.mul, operator.floordiv, + operator.truediv] + scalars = [-1, 1, 2] + idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2), + RangeIndex(-10, 10, 2), RangeIndex(5, -5, -1)] + self.check_binop(ops, scalars, idxs) + + def test_binops_pow(self): + # later versions of numpy don't allow powers of negative integers + # so test separately + # https://github.com/numpy/numpy/pull/8127 + ops = [pow] + scalars = [1, 2] + idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)] + self.check_binop(ops, scalars, idxs) + def test_too_many_names(self): def testit(): self.index.names = ["roger", "harold"] From 66b4c835f3fd7c9a05233603792a5ac51a04193f Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Tue, 25 Oct 2016 22:17:35 -0700 Subject: [PATCH 026/183] BLD: Support Cython 0.25 closes #14496 --- doc/source/whatsnew/v0.19.1.txt | 2 +- setup.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 3ee4cc1dde92d..c5822ba5ea254 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -33,7 +33,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - +- Compat with Cython 0.25 for building (:issue:`14496`) - Bug in localizing an ambiguous timezone when a boolean 
is passed (:issue:`14402`) diff --git a/setup.py b/setup.py index 3f8667cd6fe42..a17dd502d7706 100755 --- a/setup.py +++ b/setup.py @@ -85,7 +85,11 @@ def is_platform_mac(): try: if not _CYTHON_INSTALLED: raise ImportError('No supported version of Cython installed.') - from Cython.Distutils import build_ext as _build_ext + try: + from Cython.Distutils.old_build_ext import old_build_ext as _build_ext + except ImportError: + # Pre 0.25 + from Cython.Distutils import build_ext as _build_ext cython = True except ImportError: cython = False From 6130e77fb7c9d44fde5d98f9719bd67bb9ec2ade Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 26 Oct 2016 18:31:03 -0400 Subject: [PATCH 027/183] BUG: Accept unicode quotechars again in pd.read_csv Title is self-explanatory. Affects Python 2.x only. Closes #14477. Author: gfyoung Closes #14492 from gfyoung/quotechar-unicode-2.x and squashes the following commits: ec9f59a [gfyoung] BUG: Accept unicode quotechars again in pd.read_csv --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/parsers.py | 3 +++ pandas/io/tests/parser/quoting.py | 15 ++++++++++++++- pandas/parser.pyx | 3 ++- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index c5822ba5ea254..7594478ada41a 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -36,6 +36,7 @@ Bug Fixes - Compat with Cython 0.25 for building (:issue:`14496`) +- Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f8cf04e08ab03..e0127c3544971 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1759,6 +1759,9 @@ def __init__(self, f, **kwds): self.delimiter = kwds['delimiter'] self.quotechar = kwds['quotechar'] + if isinstance(self.quotechar, compat.text_type): + self.quotechar = str(self.quotechar) + self.escapechar = kwds['escapechar'] self.doublequote = kwds['doublequote'] self.skipinitialspace = kwds['skipinitialspace'] diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py index d0f1493be0621..765cec8243a0a 100644 --- a/pandas/io/tests/parser/quoting.py +++ b/pandas/io/tests/parser/quoting.py @@ -9,7 +9,7 @@ import pandas.util.testing as tm from pandas import DataFrame -from pandas.compat import StringIO +from pandas.compat import PY3, StringIO, u class QuotingTests(object): @@ -138,3 +138,16 @@ def test_double_quote(self): result = self.read_csv(StringIO(data), quotechar='"', doublequote=False) tm.assert_frame_equal(result, expected) + + def test_quotechar_unicode(self): + # See gh-14477 + data = 'a\n1' + expected = DataFrame({'a': [1]}) + + result = self.read_csv(StringIO(data), quotechar=u('"')) + tm.assert_frame_equal(result, expected) + + # Compared to Python 3.x, Python 2.x does not handle unicode well. 
+ if PY3: + result = self.read_csv(StringIO(data), quotechar=u('\u0394')) + tm.assert_frame_equal(result, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 12525c7a9c587..0a2824e74120c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -570,7 +570,8 @@ cdef class TextReader: if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: raise TypeError('bad "quoting" value') - if not isinstance(quote_char, (str, bytes)) and quote_char is not None: + if not isinstance(quote_char, (str, compat.text_type, + bytes)) and quote_char is not None: dtype = type(quote_char).__name__ raise TypeError('"quotechar" must be string, ' 'not {dtype}'.format(dtype=dtype)) From 6ac759d5e24f8a1b8eb9f39f08b139079cad401e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Oct 2016 18:32:41 -0400 Subject: [PATCH 028/183] BLD: fix 3.4 build for cython to 0.24.1 --- ci/requirements-3.4.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-3.4.build b/ci/requirements-3.4.build index 4a4bd9d433428..e6e59dcba63fe 100644 --- a/ci/requirements-3.4.build +++ b/ci/requirements-3.4.build @@ -1,3 +1,3 @@ numpy=1.8.1 -cython +cython=0.24.1 libgfortran=1.0 From 31ca7170edd1fa3cfcfd96b283d6821491324711 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 27 Oct 2016 09:11:41 +0200 Subject: [PATCH 029/183] TST: simplify tests for GH14346 (#14502) --- pandas/tests/indexes/test_datetimelike.py | 119 +++++++++------------- 1 file changed, 48 insertions(+), 71 deletions(-) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index b04e840ffc849..68db163be6fde 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -732,30 +732,21 @@ def test_fillna_datetime64(self): dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - def test_difference_of_union(self): - # GH14323: Test taking the union of differences of an Index. - # Difference of DatetimeIndex does not preserve frequency, - # so a differencing operation should not retain the freq field of the - # original index. 
- i = pd.date_range("20160920", "20160925", freq="D") - - a = pd.date_range("20160921", "20160924", freq="D") - expected = pd.DatetimeIndex(["20160920", "20160925"], freq=None) - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.date_range("20160922", "20160925", freq="D") - b_diff = i.difference(b) - expected = pd.DatetimeIndex(["20160920", "20160921"], freq=None) - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_diff = a_diff.union(b_diff) - expected = pd.DatetimeIndex(["20160920", "20160921", "20160925"], - freq=None) - tm.assert_index_equal(union_of_diff, expected) - tm.assert_attr_equal('freq', union_of_diff, expected) + def test_difference_freq(self): + # GH14323: difference of DatetimeIndex should not preserve frequency + + index = date_range("20160920", "20160925", freq="D") + other = date_range("20160921", "20160924", freq="D") + expected = DatetimeIndex(["20160920", "20160925"], freq=None) + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = date_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = DatetimeIndex(["20160920", "20160921"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) class TestPeriodIndex(DatetimeLike, tm.TestCase): @@ -963,29 +954,23 @@ def test_no_millisecond_field(self): with self.assertRaises(AttributeError): DatetimeIndex([]).millisecond - def test_difference_of_union(self): - # GH14323: Test taking the union of differences of an Index. - # Difference of Period MUST preserve frequency, but the ability - # to union results must be preserved - i = pd.period_range("20160920", "20160925", freq="D") - - a = pd.period_range("20160921", "20160924", freq="D") - expected = pd.PeriodIndex(["20160920", "20160925"], freq='D') - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.period_range("20160922", "20160925", freq="D") - b_diff = i.difference(b) - expected = pd.PeriodIndex(["20160920", "20160921"], freq='D') - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_diff = a_diff.union(b_diff) - expected = pd.PeriodIndex(["20160920", "20160921", "20160925"], - freq='D') - tm.assert_index_equal(union_of_diff, expected) - tm.assert_attr_equal('freq', union_of_diff, expected) + def test_difference_freq(self): + # GH14323: difference of Period MUST preserve frequency + # but the ability to union results must be preserved + + index = period_range("20160920", "20160925", freq="D") + + other = period_range("20160921", "20160924", freq="D") + expected = PeriodIndex(["20160920", "20160925"], freq='D') + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = period_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = PeriodIndex(["20160920", "20160921"], freq='D') + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) class TestTimedeltaIndex(DatetimeLike, tm.TestCase): @@ -1199,27 +1184,19 @@ def test_fillna_timedelta(self): [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - def test_difference_of_union(self): - # 
GH14323: Test taking the union of differences of an Index. - # Difference of TimedeltaIndex does not preserve frequency, - # so a differencing operation should not retain the freq field of the - # original index. - i = pd.timedelta_range("0 days", "5 days", freq="D") - - a = pd.timedelta_range("1 days", "4 days", freq="D") - expected = pd.TimedeltaIndex(["0 days", "5 days"], freq=None) - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.timedelta_range("2 days", "5 days", freq="D") - b_diff = i.difference(b) - expected = pd.TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_difference = a_diff.union(b_diff) - expected = pd.TimedeltaIndex(["0 days", "1 days", "5 days"], - freq=None) - tm.assert_index_equal(union_of_difference, expected) - tm.assert_attr_equal('freq', union_of_difference, expected) + def test_difference_freq(self): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other) + expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) From e7ac84d2988284604bff781c67a50974e51afdec Mon Sep 17 00:00:00 2001 From: Chris Warth Date: Thu, 27 Oct 2016 06:51:10 -0700 Subject: [PATCH 030/183] DOC: Expand on reference docs for read_json() (#14442) --- pandas/io/json.py | 91 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 1e258101a5d86..878506a6ddc05 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -123,32 +123,38 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, file. For file URLs, a host is expected. For instance, a local file could be ``file://localhost/path/to/table.json`` - orient - - * `Series` - + orient : string, + Indication of expected JSON string format. + Compatible JSON strings can be produced by ``to_json()`` with a + corresponding orient value. + The set of possible orients is: + + - ``'split'`` : dict like + ``{index -> [index], columns -> [columns], data -> [values]}`` + - ``'records'`` : list like + ``[{column -> value}, ... , {column -> value}]`` + - ``'index'`` : dict like ``{index -> {column -> value}}`` + - ``'columns'`` : dict like ``{column -> {index -> value}}`` + - ``'values'`` : just the values array + + The allowed and default values depend on the value + of the `typ` parameter. + + * when ``typ == 'series'``, + + - allowed orients are ``{'split','records','index'}`` - default is ``'index'`` - - allowed values are: ``{'split','records','index'}`` - The Series index must be unique for orient ``'index'``. - * `DataFrame` + * when ``typ == 'frame'``, + - allowed orients are ``{'split','records','index', + 'columns','values'}`` - default is ``'columns'`` - - allowed values are: {'split','records','index','columns','values'} - - The DataFrame index must be unique for orients 'index' and - 'columns'. 
- - The DataFrame columns must be unique for orients 'index', - 'columns', and 'records'. - - * The format of the JSON string - - - split : dict like - ``{index -> [index], columns -> [columns], data -> [values]}`` - - records : list like - ``[{column -> value}, ... , {column -> value}]`` - - index : dict like ``{index -> {column -> value}}`` - - columns : dict like ``{column -> {index -> value}}`` - - values : just the values array + - The DataFrame index must be unique for orients ``'index'`` and + ``'columns'``. + - The DataFrame columns must be unique for orients ``'index'``, + ``'columns'``, and ``'records'``. typ : type of object to recover (series or frame), default 'frame' dtype : boolean or dict, default True @@ -197,7 +203,48 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, Returns ------- - result : Series or DataFrame + result : Series or DataFrame, depending on the value of `typ`. + + See Also + -------- + DataFrame.to_json + + Examples + -------- + + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + + Encoding/decoding a Dataframe using ``'split'`` formatted JSON: + + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"], + "index":["row 1","row 2"], + "data":[["a","b"],["c","d"]]}' + >>> pd.read_json(_, orient='split') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + >>> pd.read_json(_, orient='index') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. + + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + >>> pd.read_json(_, orient='records') + col 1 col 2 + 0 a b + 1 c d """ filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, From d7fb5bd310edcab9875c1f9339b62e92baae8291 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 27 Oct 2016 15:53:09 +0200 Subject: [PATCH 031/183] BUG: fix DatetimeIndex._maybe_cast_slice_bound for empty index (GH14354) (#14501) --- doc/source/whatsnew/v0.19.1.txt | 4 +++- pandas/tseries/index.py | 5 +++-- pandas/tseries/tests/test_timeseries.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 7594478ada41a..a81ab6ed0311c 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -48,6 +48,7 @@ Bug Fixes - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). - Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) +- Regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`). 
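The regression noted above, assuming its user-facing form is partial-string slicing against an empty datetime-indexed object (a hypothetical minimal reproduction, not taken verbatim from the issue); with the fix the slice bounds are cast without error and an empty result comes back::

    import pandas as pd

    idx = pd.date_range('2015-01-01', periods=0, freq='H')   # empty index with a freq
    s = pd.Series([], index=idx, dtype='float64')

    print(s['2015-01-02':])        # empty Series, no exception
    print(s.loc[:'2015-01-02'])    # likewise for label-based slicing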
- Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`) @@ -78,4 +79,5 @@ Bug Fixes -- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) +- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` + is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f68750e242f1f..70e2d2c121773 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1453,8 +1453,9 @@ def _maybe_cast_slice_bound(self, label, side, kind): # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) # because label may be passed to searchsorted - # the bounds need swapped if index is reverse sorted - if self.is_monotonic_decreasing: + # the bounds need swapped if index is reverse sorted and has a + # length (is_monotonic_decreasing gives True for empty index) + if self.is_monotonic_decreasing and len(self): return upper if side == 'left' else lower return lower if side == 'left' else upper else: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index c13805d383e5d..aa8a5d10cd9d3 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3911,6 +3911,18 @@ def test_slice_with_zero_step_raises(self): self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: ts.ix[::0]) + def test_slice_bounds_empty(self): + # GH 14354 + empty_idx = DatetimeIndex(freq='1H', periods=0, end='2015') + + right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') + exp = Timestamp('2015-01-02 23:59:59.999999999') + self.assertEqual(right, exp) + + left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') + exp = Timestamp('2015-01-02 00:00:00') + self.assertEqual(left, exp) + class TestDatetime64(tm.TestCase): """ From 096d8866a90c8cbb44ab8243320e811fc24190bd Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 28 Oct 2016 04:37:55 -0400 Subject: [PATCH 032/183] MAINT: Expand lint for *.py (#14516) --- ci/lint.sh | 14 ++++---------- pandas/core/groupby.py | 2 +- pandas/core/internals.py | 3 ++- pandas/io/parsers.py | 10 +++++----- pandas/io/tests/parser/common.py | 4 ++-- pandas/msgpack/__init__.py | 14 ++++++-------- pandas/tests/indexes/test_base.py | 3 +-- pandas/util/testing.py | 2 +- 8 files changed, 22 insertions(+), 30 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index a866b04445f96..115a2cdaf7899 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -7,16 +7,10 @@ source activate pandas RET=0 if [ "$LINT" ]; then - echo "Linting" - for path in 'api' 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util' - do - echo "linting -> pandas/$path" - flake8 pandas/$path --filename '*.py' - if [ $? -ne "0" ]; then - RET=1 - fi - - done + # pandas/rpy is deprecated and will be removed. + # pandas/src is C code, so no need to search there. 
+ echo "Linting *.py" + flake8 pandas --filename '*.py' --exclude pandas/rpy,pandas/src echo "Linting *.py DONE" echo "Linting *.pyx" diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2a7f896e1b871..afddb86988970 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -6,7 +6,7 @@ import warnings import copy -from pandas.compat import( +from pandas.compat import ( zip, range, long, lzip, callable, map ) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11721a5bdac29..d9d4bb0d14228 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1147,8 +1147,9 @@ def get_result(other): def handle_error(): if raise_on_error: + # The 'detail' variable is defined in outer scope. raise TypeError('Could not operate %s with block values %s' % - (repr(other), str(detail))) + (repr(other), str(detail))) # noqa else: # return the values result = np.empty(values.shape, dtype='O') diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e0127c3544971..9e5fcd406a750 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2194,16 +2194,16 @@ def _handle_usecols(self, columns, usecols_key): usecols_key is used if there are string usecols. """ if self.usecols is not None: - if any([isinstance(u, string_types) for u in self.usecols]): + if any([isinstance(col, string_types) for col in self.usecols]): if len(columns) > 1: raise ValueError("If using multiple headers, usecols must " "be integers.") col_indices = [] - for u in self.usecols: - if isinstance(u, string_types): - col_indices.append(usecols_key.index(u)) + for col in self.usecols: + if isinstance(col, string_types): + col_indices.append(usecols_key.index(col)) else: - col_indices.append(u) + col_indices.append(col) else: col_indices = self.usecols diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 0219e16391be8..0364b3bf42fff 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -17,8 +17,8 @@ import pandas.util.testing as tm from pandas import DataFrame, Series, Index, MultiIndex from pandas import compat -from pandas.compat import(StringIO, BytesIO, PY3, - range, lrange, u) +from pandas.compat import (StringIO, BytesIO, PY3, + range, lrange, u) from pandas.io.common import DtypeWarning, EmptyDataError, URLError from pandas.io.parsers import TextFileReader, TextParser diff --git a/pandas/msgpack/__init__.py b/pandas/msgpack/__init__.py index 0c2370df936a4..33d60a12ef0a3 100644 --- a/pandas/msgpack/__init__.py +++ b/pandas/msgpack/__init__.py @@ -1,11 +1,10 @@ # coding: utf-8 -# flake8: noqa - -from pandas.msgpack._version import version -from pandas.msgpack.exceptions import * from collections import namedtuple +from pandas.msgpack.exceptions import * # noqa +from pandas.msgpack._version import version # noqa + class ExtType(namedtuple('ExtType', 'code data')): """ExtType represents ext type in msgpack.""" @@ -18,11 +17,10 @@ def __new__(cls, code, data): raise ValueError("code must be 0~127") return super(ExtType, cls).__new__(cls, code, data) +import os # noqa -import os -from pandas.msgpack._packer import Packer -from pandas.msgpack._unpacker import unpack, unpackb, Unpacker - +from pandas.msgpack._packer import Packer # noqa +from pandas.msgpack._unpacker import unpack, unpackb, Unpacker # noqa def pack(o, stream, **kwargs): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 21471b1883209..b839ed6331457 100644 --- a/pandas/tests/indexes/test_base.py +++ 
b/pandas/tests/indexes/test_base.py @@ -1576,11 +1576,10 @@ def test_string_index_repr(self): # py3/py2 repr can differ because of "u" prefix # which also affects to displayed element size - # suppress flake8 warnings if PY3: coerce = lambda x: x else: - coerce = unicode + coerce = unicode # noqa # short idx = pd.Index(['a', 'bb', 'ccc']) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 57bb01e5e0406..05517bf6cf53a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -35,7 +35,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat -from pandas.compat import( +from pandas.compat import ( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, raise_with_traceback, httplib, is_platform_windows, is_platform_32bit, PY3 From 7f5a45c1b388b3f7f309f82bfa0733b7b9980c3a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 29 Oct 2016 15:24:19 +0200 Subject: [PATCH 033/183] BUG/ERR: raise correct error when sql driver is not installed (#14527) When the driver was not installed, but sqlalchemy itself was, when passing a URI string, you got an error indicating that SQLAlchemy was not installed, instead of the driver not being installed. This was because the import error for the driver was captured as import error for sqlalchemy. --- pandas/io/sql.py | 5 +++-- pandas/io/tests/test_sql.py | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 47642c2e2bc28..c9f8d32e1b504 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -507,10 +507,11 @@ def _engine_builder(con): if isinstance(con, string_types): try: import sqlalchemy - con = sqlalchemy.create_engine(con) - return con except ImportError: _SQLALCHEMY_INSTALLED = False + else: + con = sqlalchemy.create_engine(con) + return con return con diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index af8989baabbc0..e9d19bbd8be66 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -944,7 +944,7 @@ def test_sqlalchemy_type_mapping(self): self.assertTrue(isinstance( table.table.c['time'].type, sqltypes.DateTime)) - def test_to_sql_read_sql_with_database_uri(self): + def test_database_uri_string(self): # Test read_sql and .to_sql method with a database URI (GH10654) test_frame1 = self.test_frame1 @@ -963,6 +963,12 @@ def test_to_sql_read_sql_with_database_uri(self): tm.assert_frame_equal(test_frame1, test_frame3) tm.assert_frame_equal(test_frame1, test_frame4) + # using driver that will not be installed on Travis to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user + db_uri = "postgresql+pg8000://user:pass@host/dbname" + with tm.assertRaisesRegexp(ImportError, "pg8000"): + sql.read_sql("select * from table", db_uri) + def _make_iris_table_metadata(self): sa = sqlalchemy metadata = sa.MetaData() From 1ce62992ac086d40353228902048a13e8765ceb5 Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Mon, 31 Oct 2016 08:23:39 -0400 Subject: [PATCH 034/183] DOC: Simplify the gbq integration testing procedure for contributors (#14541) --- ci/travis_encrypt_gbq.sh | 11 +++++------ ci/travis_gbq_config.txt | 1 - ci/travis_process_gbq_encryption.sh | 6 ++++-- doc/source/contributing.rst | 30 +++++++++++++++-------------- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh index 719db67f384e0..e404ca73a405e 100755 --- a/ci/travis_encrypt_gbq.sh +++ b/ci/travis_encrypt_gbq.sh @@ 
-1,11 +1,10 @@ #!/bin/bash GBQ_JSON_FILE=$1 -GBQ_PROJECT_ID=$2 -if [[ $# -ne 2 ]]; then +if [[ $# -ne 1 ]]; then echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\ - " " + "" exit 1 fi @@ -23,9 +22,9 @@ echo "Encrypting $GBQ_JSON_FILE..." read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file $GBQ_JSON_FILE \ travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); -echo "Adding your secure key and project id to travis_gbq_config.txt ..." -echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY\n"\ -"GBQ_PROJECT_ID='$GBQ_PROJECT_ID'" > travis_gbq_config.txt +echo "Adding your secure key to travis_gbq_config.txt ..." +echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY"\ +> travis_gbq_config.txt echo "Done. Removing file $GBQ_JSON_FILE" rm $GBQ_JSON_FILE diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt index 3b68d62f177cc..0b28cdedbd0d7 100644 --- a/ci/travis_gbq_config.txt +++ b/ci/travis_gbq_config.txt @@ -1,3 +1,2 @@ TRAVIS_IV_ENV=encrypted_1d9d7b1f171b_iv TRAVIS_KEY_ENV=encrypted_1d9d7b1f171b_key -GBQ_PROJECT_ID='pandas-travis' diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh index 7ff4c08f78e37..9967d40e49f0a 100755 --- a/ci/travis_process_gbq_encryption.sh +++ b/ci/travis_process_gbq_encryption.sh @@ -2,10 +2,12 @@ source ci/travis_gbq_config.txt -if [[ -n ${!TRAVIS_IV_ENV} ]]; then +if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then + echo "${SERVICE_ACCOUNT_KEY}" > ci/travis_gbq.json; +elif [[ -n ${!TRAVIS_IV_ENV} ]]; then openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; - export GBQ_PROJECT_ID=$GBQ_PROJECT_ID; + export GBQ_PROJECT_ID='pandas-travis'; echo 'Successfully decrypted gbq credentials' fi diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index a8a47a9d979c0..44ee6223d5ee1 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -643,20 +643,22 @@ on Travis-CI and are only accessible from the pandas-dev/pandas repository. The credentials won't be available on forks of pandas. Here are the steps to run gbq integration tests on a forked repository: -#. First, complete all the steps in the `Encrypting Files Prerequisites - `__ section. -#. Sign into `Travis `__ using your GitHub account. -#. Enable your forked repository of pandas for testing in `Travis - `__. -#. Run the following command from terminal where the current working directory - is the ``ci`` folder:: - - ./travis_encrypt_gbq.sh - -#. Create a new branch from the branch used in your pull request. Commit the - encrypted file called ``travis_gbq.json.enc`` as well as the file - ``travis_gbq_config.txt``, in an otherwise empty commit. DO NOT commit the - ``*.json`` file which contains your unencrypted private key. +#. Go to `Travis CI `__ and sign in with your GitHub + account. +#. Click on the ``+`` icon next to the ``My Repositories`` list and enable + Travis builds for your fork. +#. Click on the gear icon to edit your travis build, and add two environment + variables: + + - ``GBQ_PROJECT_ID`` with the value being the ID of your BigQuery project. + + - ``SERVICE_ACCOUNT_KEY`` with the value being the contents of the JSON key + that you downloaded for your service account. Use single quotes around + your JSON key to ensure that it is treated as a string. + + For both environment variables, keep the "Display value in build log" option + DISABLED. 
These variables contain sensitive data and you do not want their + contents being exposed in build logs. #. Your branch should be tested automatically once it is pushed. You can check the status by visiting your Travis branches page which exists at the following location: https://travis-ci.org/your-user-name/pandas/branches . From 47f117d18a99f8bbaf2ecbc7829d198d304b8c2a Mon Sep 17 00:00:00 2001 From: Piotr Chromiec Date: Mon, 31 Oct 2016 13:24:05 +0100 Subject: [PATCH 035/183] BUG: tseries ceil doc fix (#14543) --- pandas/tseries/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 96213a4aec34d..4645ae24684ff 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -100,7 +100,7 @@ def round(self, freq, *args, **kwargs): def floor(self, freq): return self._round(freq, np.floor) - @Appender(_round_doc % "floor") + @Appender(_round_doc % "ceil") def ceil(self, freq): return self._round(freq, np.ceil) From b08811220459ac8271dd904a94e82278bd4066c4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 31 Oct 2016 16:39:59 -0400 Subject: [PATCH 036/183] BUG: Don't parse inline quotes in skipped lines (#14514) Closes gh-14459. --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/tests/parser/skiprows.py | 8 ++++++ pandas/src/parser/tokenizer.c | 45 ++++++++++++++++++++---------- pandas/src/parser/tokenizer.h | 7 +++-- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index a81ab6ed0311c..ab999643d575b 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -36,6 +36,7 @@ Bug Fixes - Compat with Cython 0.25 for building (:issue:`14496`) +- Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`) - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index c9f50dec6c01e..9f01adb6fabcb 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -190,3 +190,11 @@ def test_skiprows_lineterminator(self): skiprows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) + + def test_skiprows_infield_quote(self): + # see gh-14459 + data = 'a"\nb"\na\n1' + expected = DataFrame({'a': [1]}) + + df = self.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(df, expected) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index af85b7b894d26..748edc7fcacc5 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -478,9 +478,10 @@ static int end_line(parser_t *self) { } } - if (self->state == SKIP_LINE || \ - self->state == QUOTE_IN_SKIP_LINE || \ - self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE + if (self->state == START_FIELD_IN_SKIP_LINE || \ + self->state == IN_FIELD_IN_SKIP_LINE || \ + self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \ + self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE ) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count @@ -761,38 +762,54 @@ int 
tokenize_bytes(parser_t *self, size_t line_limit) switch(self->state) { - case SKIP_LINE: - TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state)); + case START_FIELD_IN_SKIP_LINE: if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } else if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_DELIMITER(c)) { + // Do nothing, we're starting a new field again. + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + break; + + case IN_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } break; - case QUOTE_IN_SKIP_LINE: + case IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { if (self->doublequote) { - self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE; + self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; } } break; - case QUOTE_IN_QUOTE_IN_SKIP_LINE: + case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; } break; @@ -846,9 +863,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit) // start of record if (skip_this_line(self, self->file_lines)) { if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; if (IS_TERMINATOR(c)) { END_LINE(); diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 8f7ae436bb7b7..487c1265d9358 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -123,9 +123,10 @@ typedef enum { EAT_COMMENT, EAT_LINE_COMMENT, WHITESPACE_LINE, - SKIP_LINE, - QUOTE_IN_SKIP_LINE, - QUOTE_IN_QUOTE_IN_SKIP_LINE, + START_FIELD_IN_SKIP_LINE, + IN_FIELD_IN_SKIP_LINE, + IN_QUOTED_FIELD_IN_SKIP_LINE, + QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, FINISHED } ParserState; From 60a335e457bc40d009d5be99b384a62fa34ba3fc Mon Sep 17 00:00:00 2001 From: "Brandon M. 
Burroughs" Date: Mon, 31 Oct 2016 16:53:51 -0400 Subject: [PATCH 037/183] BUG: Dataframe constructor when given dict with None value (#14392) --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/core/series.py | 4 ++-- pandas/tests/frame/test_constructors.py | 8 ++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index ab999643d575b..cb02c3a5009ab 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -41,6 +41,7 @@ Bug Fixes - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) +- Bug in ``pd.DataFrame`` where constructor fails when given dict with ``None`` value (:issue:`14381`) - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c6b13885dd01..188204d83d985 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2915,8 +2915,8 @@ def create_from_value(value, index, dtype): return subarr - # scalar like - if subarr.ndim == 0: + # scalar like, GH + if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index d21db5ba52a45..e55ba3e161ed9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -259,6 +259,14 @@ def test_constructor_dict(self): frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) self.assert_index_equal(frame.index, Index([], dtype=np.int64)) + # GH 14381 + # Dict with None value + frame_none = DataFrame(dict(a=None), index=[0]) + frame_none_list = DataFrame(dict(a=[None]), index=[0]) + tm.assert_equal(frame_none.get_value(0, 'a'), None) + tm.assert_equal(frame_none_list.get_value(0, 'a'), None) + tm.assert_frame_equal(frame_none, frame_none_list) + # GH10856 # dict with scalar values should raise error, even if columns passed with tm.assertRaises(ValueError): From e5443622e413138f626db64825adb0c9e3efa4a2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Nov 2016 17:15:06 +0100 Subject: [PATCH 038/183] Update ISSUE_TEMPLATE: be more explicit on where to paste the output of show_versions --- .github/ISSUE_TEMPLATE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6f91eba1ad239..c7d731249f9cf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -10,6 +10,6 @@ #### Output of ``pd.show_versions()``
-# Paste the output here +# Paste the output here pd.show_versions() here
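[Editor's note — illustration only, not part of the patch series] A minimal sketch of the constructor behaviour restored by the dict-with-``None`` fix above (GH 14381), assuming pandas 0.19.1; it simply mirrors the new test added in pandas/tests/frame/test_constructors.py:

    import pandas as pd
    import pandas.util.testing as tm

    # A scalar None value in the constructor dict now behaves like a
    # length-1 list of None when an explicit index is supplied.
    frame_none = pd.DataFrame(dict(a=None), index=[0])
    frame_none_list = pd.DataFrame(dict(a=[None]), index=[0])

    tm.assert_frame_equal(frame_none, frame_none_list)
    print(frame_none.get_value(0, 'a'))  # None, instead of the constructor failing
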
From 8b805626cfd5af4d8c93c4db44b59fa85ee1b091 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 2 Nov 2016 05:57:41 -0400 Subject: [PATCH 039/183] asv compat for py3 --- asv_bench/benchmarks/inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 0f9689dadcbb0..2e394ed4268f3 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -148,12 +148,12 @@ class to_numeric(object): N = 500000 data_dict = { - 'string-int': (['1'] * (N / 2)) + ([2] * (N / 2)), - 'string-nint': (['-1'] * (N / 2)) + ([2] * (N / 2)), + 'string-int': (['1'] * (N // 2)) + ([2] * (N // 2)), + 'string-nint': (['-1'] * (N // 2)) + ([2] * (N // 2)), 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], dtype='datetime64[D]'), N), - 'string-float': (['1.1'] * (N / 2)) + ([2] * (N / 2)), - 'int-list': ([1] * (N / 2)) + ([2] * (N / 2)), + 'string-float': (['1.1'] * (N // 2)) + ([2] * (N // 2)), + 'int-list': ([1] * (N // 2)) + ([2] * (N // 2)), 'int32': np.repeat(np.int32(1), N) } From eb7bd993da0b8cbfbbe716ca67a4a8745de41e23 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Nov 2016 12:27:40 +0100 Subject: [PATCH 040/183] BUG: don't close user-provided file handles in C parser (GH14418) (#14520) --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/parsers.py | 2 ++ pandas/io/tests/parser/common.py | 23 +++++++++++++++++++++++ pandas/parser.pyx | 9 ++++----- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index cb02c3a5009ab..a604ead87b2ab 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -38,6 +38,7 @@ Bug Fixes - Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`) - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) +- Fixed regression where user-provided file handles were closed in ``read_csv`` (c engine) (:issue:`14418`). 
- Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9e5fcd406a750..090a21632cddb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1456,6 +1456,8 @@ def __init__(self, src, **kwds): def close(self): for f in self.handles: f.close() + + # close additional handles opened by C parser (for compression) try: self._reader.close() except: diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 0364b3bf42fff..3be02c55ea10a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1602,3 +1602,26 @@ def test_internal_eof_byte(self): expected = pd.DataFrame([["1\x1a", 2]], columns=['a', 'b']) result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) + + def test_file_handles(self): + # GH 14418 - don't close user provided file handles + + fh = StringIO('a,b\n1,2') + self.read_csv(fh) + self.assertFalse(fh.closed) + + with open(self.csv1, 'r') as f: + self.read_csv(f) + self.assertFalse(f.closed) + + # mmap not working with python engine + if self.engine != 'python': + + import mmap + with open(self.csv1, 'r') as f: + m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + self.read_csv(m) + # closed attribute new in python 3.2 + if PY3: + self.assertFalse(m.closed) + m.close() diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 0a2824e74120c..93a494c176b99 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -272,7 +272,7 @@ cdef class TextReader: parser_t *parser object file_handle, na_fvalues object true_values, false_values - object dsource + object handle bint na_filter, verbose, has_usecols, has_mi_columns int parser_start list clocks @@ -554,9 +554,9 @@ cdef class TextReader: def close(self): # we need to properly close an open derived # filehandle here, e.g. 
and UTFRecoder - if self.dsource is not None: + if self.handle is not None: try: - self.dsource.close() + self.handle.close() except: pass @@ -641,6 +641,7 @@ cdef class TextReader: else: raise ValueError('Unrecognized compression type: %s' % self.compression) + self.handle = source if isinstance(source, basestring): if not isinstance(source, bytes): @@ -684,8 +685,6 @@ cdef class TextReader: raise IOError('Expected file path name or file-like object,' ' got %s type' % type(source)) - self.dsource = source - cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] From 52f31d470d779204e4c1388cdb56351c68332c3f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Nov 2016 14:01:04 +0100 Subject: [PATCH 041/183] BUG: DataFrame.quantile with NaNs (GH14357) (#14536) --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/core/internals.py | 51 ++++++++++----- pandas/tests/frame/test_quantile.py | 97 ++++++++++++++++++++++++++++ pandas/tests/series/test_quantile.py | 32 +++++++++ 4 files changed, 166 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index a604ead87b2ab..5de59ed373523 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -66,7 +66,7 @@ Bug Fixes - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) - +- Regression in ``DataFrame.quantile`` when missing values where present in some columns (:issue:`14357`). - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index d9d4bb0d14228..43beefffd448e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -6,7 +6,6 @@ from collections import defaultdict import numpy as np -from numpy import percentile as _quantile from pandas.core.base import PandasObject @@ -1316,16 +1315,38 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): values = self.get_values() values, _, _, _ = self._try_coerce_args(values, values) - mask = isnull(self.values) - if not lib.isscalar(mask) and mask.any(): - # even though this could be a 2-d mask it appears - # as a 1-d result - mask = mask.reshape(values.shape) - result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) - values = _block_shape(values[~mask], ndim=self.ndim) - if self.ndim > 1: - values = values.reshape(result_shape) + def _nanpercentile1D(values, mask, q, **kw): + values = values[~mask] + + if len(values) == 0: + if is_scalar(q): + return self._na_value + else: + return np.array([self._na_value] * len(q), + dtype=values.dtype) + + return np.percentile(values, q, **kw) + + def _nanpercentile(values, q, axis, **kw): + + mask = isnull(self.values) + if not is_scalar(mask) and mask.any(): + if self.ndim == 1: + return _nanpercentile1D(values, mask, q, **kw) + else: + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [_nanpercentile1D(val, m, q, **kw) for (val, m) + in zip(list(values), list(mask))] + result = np.array(result, dtype=values.dtype, copy=False).T + return result + else: + return 
np.percentile(values, q, axis=axis, **kw) from pandas import Float64Index is_empty = values.shape[axis] == 0 @@ -1344,13 +1365,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: try: - result = _quantile(values, np.array(qs) * 100, - axis=axis, **kw) + result = _nanpercentile(values, np.array(qs) * 100, + axis=axis, **kw) except ValueError: # older numpies don't handle an array for q - result = [_quantile(values, q * 100, - axis=axis, **kw) for q in qs] + result = [_nanpercentile(values, q * 100, + axis=axis, **kw) for q in qs] result = np.array(result, copy=False) if self.ndim > 1: @@ -1369,7 +1390,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: result = np.array([self._na_value] * len(self)) else: - result = _quantile(values, qs * 100, axis=axis, **kw) + result = _nanpercentile(values, qs * 100, axis=axis, **kw) ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 52e8697abe850..22414a6ba8a53 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -262,6 +262,11 @@ def test_quantile_datetime(self): index=[0.5], columns=[0, 1]) assert_frame_equal(result, expected) + # empty when numeric_only=True + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # result = df[['a', 'c']].quantile(.5) + # result = df[['a', 'c']].quantile([.5]) + def test_quantile_invalid(self): msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: @@ -340,3 +345,95 @@ def test_quantile_box(self): pd.Timedelta('2 days')]], index=[0.5], columns=list('AaBbCc')) tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self): + + # GH 14357 - float block where some cols have missing values + df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1) + exp = Series(np.arange(1.0, 6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], axis=1) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + # full-nan column + df['b'] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]}, + index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self): + + # full NaT column + df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.NaT], index=['a'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame({'a': [pd.NaT]}, index=[0.5]) + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame({'a': [pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03')], + 'b': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'], + name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = 
DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5], + columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty(self): + + # floats + df = DataFrame(columns=['a', 'b'], dtype='float64') + + res = df.quantile(0.5) + exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5]) + tm.assert_frame_equal(res, exp) + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5, axis=1) + # res = df.quantile([0.5], axis=1) + + # ints + df = DataFrame(columns=['a', 'b'], dtype='int64') + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5) + + # datetimes + df = DataFrame(columns=['a', 'b'], dtype='datetime64') + + # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) + # res = df.quantile(0.5, numeric_only=False) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 7d2517987e526..76db6c90a685f 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -184,3 +184,35 @@ def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile([0.5]) tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) + + def test_quantile_empty(self): + + # floats + s = Series([], dtype='float64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # int + s = Series([], dtype='int64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # datetime + s = Series([], dtype='datetime64[ns]') + + res = s.quantile(0.5) + self.assertTrue(res is pd.NaT) + + res = s.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5]) + tm.assert_series_equal(res, exp) From 1d951794187b72841d59b3dfde0a98309a64dec7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Nov 2016 16:16:15 +0100 Subject: [PATCH 042/183] PERF: casting loc to labels dtype before searchsorted (#14551) --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/indexes/multi.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 5de59ed373523..80a3e38fd5edd 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -24,7 +24,7 @@ Performance Improvements - Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) - Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461) - Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461) - +- Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`). 
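[Editor's note — illustration only, not part of the patch series] The MultiIndex ``loc`` speed-up recorded above comes from coercing the scalar lookup value to the dtype of the level's label array before calling ``searchsorted`` (see the pandas/indexes/multi.py diff that follows). A self-contained sketch of the pattern; the performance rationale — avoiding dtype reconciliation between the small-integer label array and a wider scalar — is an editorial reading of the change, not a claim made in the patch itself:

    import numpy as np

    labels = np.zeros(1000000, dtype=np.int8)  # MultiIndex level codes are small ints

    loc = 5                                    # plain Python int (wider scalar)
    loc_cast = labels.dtype.type(loc)          # cast to the labels' dtype first, as in the patch

    # Both calls return the same insertion point; the cast form lets the
    # binary search run directly against the int8 data.
    assert labels.searchsorted(loc_cast, side='left') == labels.searchsorted(loc, side='left')
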
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index a9f452db69659..f9576d92d8a49 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1907,6 +1907,13 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): return np.array(labels == loc, dtype=bool) else: # sorted, so can return slice object -> view + try: + loc = labels.dtype.type(loc) + except TypeError: + # this occurs when loc is a slice (partial string indexing) + # but the TypeError raised by searchsorted in this case + # is catched in Index._has_valid_type() + pass i = labels.searchsorted(loc, side='left') j = labels.searchsorted(loc, side='right') return slice(i, j) From 093aa8231eae92ff6cf7ef9564d62289b458aaff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Nov 2016 10:36:10 +0100 Subject: [PATCH 043/183] DEPR: add deprecation warning for com.array_equivalent (#14567) pandas.core.common.array_equivalent was removed without deprecation warning. This commits adds it back to the core.common namespace with deprecation warning --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/api/tests/test_api.py | 7 +++++++ pandas/core/common.py | 9 +++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 80a3e38fd5edd..cbdacb95a3d4a 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -34,7 +34,7 @@ Bug Fixes ~~~~~~~~~ - Compat with Cython 0.25 for building (:issue:`14496`) - +- Added back ``pandas.core.common.array_equivalent`` with a deprecation warning (:issue:`14555`). - Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`) - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index d4d8b7e4e9747..49aa31c375e25 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +import numpy as np + import pandas as pd from pandas.core import common as com from pandas import api @@ -184,6 +186,11 @@ def test_deprecation_core_common(self): for t in self.allowed: self.check_deprecation(getattr(com, t), getattr(types, t)) + def test_deprecation_core_common_array_equivalent(self): + + with tm.assert_produces_warning(DeprecationWarning): + com.array_equivalent(np.array([1, 2]), np.array([1, 2])) + def test_deprecation_core_common_moved(self): # these are in pandas.types.common diff --git a/pandas/core/common.py b/pandas/core/common.py index 341bd3b4cc845..295947bbc1166 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -64,6 +64,15 @@ def wrapper(*args, **kwargs): setattr(m, t, outer(t)) +# deprecate array_equivalent + +def array_equivalent(*args, **kwargs): + warnings.warn("'pandas.core.common.array_equivalent' is deprecated and " + "is no longer public API", DeprecationWarning, stacklevel=2) + from pandas.types import missing + return missing.array_equivalent(*args, **kwargs) + + class PandasError(Exception): pass From 7f0c4e084dd5fe3c010fd4f07633368004c6a0f5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Nov 2016 12:10:43 +0100 Subject: [PATCH 044/183] DOC: rst fixes --- doc/source/ecosystem.rst | 2 +- doc/source/io.rst | 2 +- doc/source/whatsnew/v0.13.0.txt | 2 +- doc/source/whatsnew/v0.19.1.txt | 6 +++--- pandas/core/generic.py | 2 ++ 5 files changed, 8 insertions(+), 6 
deletions(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index a37b1e89c7cc3..087b265ee83f2 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -143,7 +143,7 @@ both "column wise min/max and global min/max coloring." API ----- -`pandas-datareader `__ +`pandas-datareader `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``pandas-datareader`` is a remote data access library for pandas. ``pandas.io`` from pandas < 0.17.0 is now refactored/split-off to and importable from ``pandas_datareader`` (PyPI:``pandas-datareader``). Many/most of the supported APIs have at least a documentation paragraph in the `pandas-datareader docs `_: diff --git a/doc/source/io.rst b/doc/source/io.rst index ae71587c8b46b..ba1bd328d2991 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2789,7 +2789,7 @@ both on the writing (serialization), and reading (deserialization). | 0.17 / Python 3 | >=0.18 / any Python | +----------------------+------------------------+ | 0.18 | >= 0.18 | - +======================+========================+ + +----------------------+------------------------+ Reading (files packed by older versions) is backward-compatibile, except for files packed with 0.17 in Python 2, in which case only they can only be unpacked in Python 2. diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt index 0944d849cfafd..6ecd4b487c798 100644 --- a/doc/source/whatsnew/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.txt @@ -600,7 +600,7 @@ Enhancements .. ipython:: python t = Timestamp('20130101 09:01:02') - t + pd.datetools.Nano(123) + t + pd.tseries.offsets.Nano(123) - A new method, ``isin`` for DataFrames, which plays nicely with boolean indexing. The argument to ``isin``, what we're comparing the DataFrame to, can be a DataFrame, Series, dict, or array of values. See :ref:`the docs` for more. diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index cbdacb95a3d4a..19964a499c4e4 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -22,8 +22,8 @@ Performance Improvements - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`) - Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) -- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461) -- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461) +- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`) +- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`) - Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`). @@ -50,7 +50,7 @@ Bug Fixes - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). -- Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) +- Bug in union of differences from a ``DatetimeIndex``; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) - Regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`). 
- Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 037ab900e6150..8e18b65e80385 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4008,6 +4008,8 @@ def asfreq(self, freq, method=None, how=None, normalize=False): ------- converted : type of caller + Notes + ----- To learn more about the frequency strings, please see `this link `__. """ From 252526cc0f197fb4c6b93cad41ca7cbcc5a82ed7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Nov 2016 12:12:19 +0100 Subject: [PATCH 045/183] BUG/API: Index.append with mixed object/Categorical indices (#14545) * BUG/API: Index.append with mixed object/Categorical indices * Only coerce to object if the calling index is not categorical * Add test for the df.info() case (GH14298) --- pandas/indexes/base.py | 8 ++++---- pandas/tests/frame/test_repr_info.py | 8 ++++++++ pandas/tests/indexes/test_category.py | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 4d2dcd259e623..54eaf86315a88 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1464,13 +1464,13 @@ def append(self, other): names = set([obj.name for obj in to_concat]) name = None if len(names) > 1 else self.name - typs = _concat.get_dtype_kinds(to_concat) - - if 'category' in typs: - # if any of the to_concat is category + if self.is_categorical(): + # if calling index is category, don't check dtype of others from pandas.indexes.category import CategoricalIndex return CategoricalIndex._append_same_dtype(self, to_concat, name) + typs = _concat.get_dtype_kinds(to_concat) + if len(typs) == 1: return self._append_same_dtype(to_concat, name=name) return _concat._concat_index_asobject(to_concat, name=name) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 5e5e9abda1200..12cd62f8b4cc0 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -405,3 +405,11 @@ def memory_usage(f): # high upper bound self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000) + + def test_info_categorical(self): + # GH14298 + idx = pd.CategoricalIndex(['a', 'b']) + df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx) + + buf = StringIO() + df.info(buf=buf) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 9f8405bcc2e1e..c76f5ff22c534 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -278,6 +278,11 @@ def test_append(self): # invalid objects self.assertRaises(TypeError, lambda: ci.append(Index(['a', 'd']))) + # GH14298 - if base object is not categorical -> coerce to object + result = Index(['c', 'a']).append(ci) + expected = Index(list('caaabbca')) + tm.assert_index_equal(result, expected, exact=True) + def test_insert(self): ci = self.create_index() From e1cdc4b83d7e970e9683056be722d5fa2a00fa70 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Nov 2016 15:17:42 +0100 Subject: [PATCH 046/183] DOC: update whatsnew/release notes for 0.19.1 (#14573) --- doc/source/release.rst | 44 ++++++++++++++++++++++++++++ doc/source/whatsnew/v0.19.1.txt | 51 +++++++++------------------------ 2 files changed, 57 insertions(+), 38 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index d210065f04459..622e9a53ff8f0 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ 
-37,6 +37,50 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org + +pandas 0.19.1 +------------- + +**Release date:** November 3, 2016 + +This is a minor bug-fix release from 0.19.0 and includes some small regression fixes, +bug fixes and performance improvements. + +See the :ref:`v0.19.1 Whatsnew ` page for an overview of all +bugs that have been fixed in 0.19.1. + +Thanks +~~~~~~ + +- Adam Chainz +- Anthonios Partheniou +- Arash Rouhani +- Ben Kandel +- Brandon M. Burroughs +- Chris +- chris-b1 +- Chris Warth +- David Krych +- dubourg +- gfyoung +- Iván Vallés Pérez +- Jeff Reback +- Joe Jevnik +- Jon M. Mease +- Joris Van den Bossche +- Josh Owen +- Keshav Ramaswamy +- Larry Ren +- mattrijk +- Michael Felt +- paul-mannino +- Piotr Chromiec +- Robert Bradshaw +- Sinhrks +- Thiago Serafim +- Tom Bird + + pandas 0.19.0 ------------- diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 19964a499c4e4..db5bd22393e64 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -1,15 +1,12 @@ .. _whatsnew_0191: -v0.19.1 (????, 2016) ---------------------- +v0.19.1 (November 3, 2016) +-------------------------- -This is a minor bug-fix release from 0.19.0 and includes a large number of -bug fixes along with several new features, enhancements, and performance improvements. +This is a minor bug-fix release from 0.19.0 and includes some small regression fixes, +bug fixes and performance improvements. We recommend that all users upgrade to this version. -Highlights include: - - .. contents:: What's new in v0.19.1 :local: :backlinks: none @@ -21,52 +18,38 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`) -- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) -- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`) +- Fixed performance regression in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`) - Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`) +- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) - Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`). - .. _whatsnew_0191.bug_fixes: Bug Fixes ~~~~~~~~~ +- Source installs from PyPI will now again work without ``cython`` installed, as in previous versions (:issue:`14204`) - Compat with Cython 0.25 for building (:issue:`14496`) +- Fixed regression where user-provided file handles were closed in ``read_csv`` (c engine) (:issue:`14418`). +- Fixed regression in ``DataFrame.quantile`` when missing values where present in some columns (:issue:`14357`). +- Fixed regression in ``Index.difference`` where the ``freq`` of a ``DatetimeIndex`` was incorrectly set (:issue:`14323`) - Added back ``pandas.core.common.array_equivalent`` with a deprecation warning (:issue:`14555`). - - Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`) - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) -- Fixed regression where user-provided file handles were closed in ``read_csv`` (c engine) (:issue:`14418`). +- Fixed regression in ``Index.append`` when categorical indices were appended (:issue:`14545`). 
+- Fixed regression in ``pd.DataFrame`` where constructor fails when given dict with ``None`` value (:issue:`14381`) +- Fixed regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`). - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) - -- Bug in ``pd.DataFrame`` where constructor fails when given dict with ``None`` value (:issue:`14381`) - - - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`) - Corrrecly raise ``ValueError`` on empty input to ``pd.eval()`` and ``df.query()`` (:issue:`13139`) - - - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). -- Bug in union of differences from a ``DatetimeIndex``; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) -- Regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`). - - Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`) - - - Bug in ``Series.__setitem__`` which allowed mutating read-only arrays (:issue:`14359`). - - -- Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`) - Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`) - - ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`) - - - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) -- Regression in ``DataFrame.quantile`` when missing values where present in some columns (:issue:`14357`). 
- Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) @@ -74,13 +57,5 @@ Bug Fixes - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`) - - - - - - - - - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file From 7a2bcb6605bacea858ec14cfac424898deb568b3 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 4 Nov 2016 07:33:12 -0400 Subject: [PATCH 047/183] DOC: Update GitHub org from pydata to pandas-dev (#14575) Revert pydata/pandas-datareader references --- .github/CONTRIBUTING.md | 20 ++++++++++---------- RELEASE.md | 2 +- asv_bench/asv.conf.json | 2 +- doc/source/contributing.rst | 2 +- doc/source/html-styling.ipynb | 4 ++-- doc/source/index.rst.template | 4 ++-- doc/source/remote_data.rst | 2 +- doc/source/whatsnew/v0.14.0.txt | 4 ++-- doc/source/whatsnew/v0.17.1.txt | 2 +- doc/source/whatsnew/v0.4.x.txt | 8 ++++---- doc/source/whatsnew/v0.5.0.txt | 4 ++-- pandas/io/data.py | 2 +- pandas/io/wb.py | 2 +- pandas/tslib.pyx | 4 ++-- 14 files changed, 31 insertions(+), 31 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index cf604822d6eea..7898822e0e11d 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -6,7 +6,7 @@ Where to start? All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome. -If you are simply looking to start working with the *pandas* codebase, navigate to the [GitHub "issues" tab](https://github.com/pydata/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pydata/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pydata/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out. +If you are simply looking to start working with the *pandas* codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pandas-dev/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out. Or maybe through using *pandas* you have an idea of you own or are looking for something in the documentation and thinking 'this can be improved'...you can do something about it! @@ -49,7 +49,7 @@ Now that you have an issue you want to fix, enhancement to add, or documentation To the new user, working with Git is one of the more daunting aspects of contributing to *pandas*. 
It can very quickly become overwhelming, but sticking to the guidelines below will help keep the process straightforward and mostly trouble free. As always, if you are having difficulties please feel free to ask for help. -The code is hosted on [GitHub](https://www.github.com/pydata/pandas). To contribute you will need to sign up for a [free GitHub account](https://github.com/signup/free). We use [Git](http://git-scm.com/) for version control to allow many people to work together on the project. +The code is hosted on [GitHub](https://www.github.com/pandas-dev/pandas). To contribute you will need to sign up for a [free GitHub account](https://github.com/signup/free). We use [Git](http://git-scm.com/) for version control to allow many people to work together on the project. Some great resources for learning Git: @@ -63,11 +63,11 @@ Some great resources for learning Git: ### Forking -You will need your own fork to work on the code. Go to the [pandas project page](https://github.com/pydata/pandas) and hit the `Fork` button. You will want to clone your fork to your machine: +You will need your own fork to work on the code. Go to the [pandas project page](https://github.com/pandas-dev/pandas) and hit the `Fork` button. You will want to clone your fork to your machine: git clone git@github.com:your-user-name/pandas.git pandas-yourname cd pandas-yourname - git remote add upstream git://github.com/pydata/pandas.git + git remote add upstream git://github.com/pandas-dev/pandas.git This creates the directory pandas-yourname and connects your repository to the upstream (main project) *pandas* repository. @@ -268,7 +268,7 @@ and make these changes with: pep8radius master --diff --in-place -Alternatively, use the [flake8](http://pypi.python.org/pypi/flake8) tool for checking the style of your code. Additional standards are outlined on the [code style wiki page](https://github.com/pydata/pandas/wiki/Code-Style-and-Conventions). +Alternatively, use the [flake8](http://pypi.python.org/pypi/flake8) tool for checking the style of your code. Additional standards are outlined on the [code style wiki page](https://github.com/pandas-dev/pandas/wiki/Code-Style-and-Conventions). Please try to maintain backward compatibility. *pandas* has lots of users with lots of existing code, so don't break it if at all possible. If you think breakage is required, clearly state why as part of the pull request. Also, be careful when changing method signatures and add deprecation warnings where needed. @@ -282,7 +282,7 @@ Like many packages, *pandas* uses the [Nose testing system](https://nose.readthe #### Writing tests -All tests should go into the `tests` subdirectory of the specific package. This folder contains many current examples of tests, and we suggest looking to these for inspiration. If your test requires working with files or network connectivity, there is more information on the [testing page](https://github.com/pydata/pandas/wiki/Testing) of the wiki. +All tests should go into the `tests` subdirectory of the specific package. This folder contains many current examples of tests, and we suggest looking to these for inspiration. If your test requires working with files or network connectivity, there is more information on the [testing page](https://github.com/pandas-dev/pandas/wiki/Testing) of the wiki. 
The `pandas.util.testing` module has many special `assert` functions that make it easier to make statements about whether Series or DataFrame objects are equivalent. The easiest way to verify that your code is correct is to explicitly construct the result you expect, then compare the actual result to the expected correct result: @@ -378,7 +378,7 @@ This will check out the master revision and run the suite on both master and you You can run specific benchmarks using the `-r` flag, which takes a regular expression. -See the [performance testing wiki](https://github.com/pydata/pandas/wiki/Performance-Testing) for information on how to write a benchmark. +See the [performance testing wiki](https://github.com/pandas-dev/pandas/wiki/Performance-Testing) for information on how to write a benchmark. ### Documenting your code @@ -390,7 +390,7 @@ If your code is an enhancement, it is most likely necessary to add usage example .. versionadded:: 0.17.0 ``` -This will put the text *New in version 0.17.0* wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method ([example](https://github.com/pydata/pandas/blob/v0.16.2/pandas/core/generic.py#L1959)) or a new keyword argument ([example](https://github.com/pydata/pandas/blob/v0.16.2/pandas/core/frame.py#L1171)). +This will put the text *New in version 0.17.0* wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method ([example](https://github.com/pandas-dev/pandas/blob/v0.16.2/pandas/core/generic.py#L1959)) or a new keyword argument ([example](https://github.com/pandas-dev/pandas/blob/v0.16.2/pandas/core/frame.py#L1171)). Contributing your changes to *pandas* ------------------------------------- @@ -466,8 +466,8 @@ If you added the upstream repository as described above you will see something l origin git@github.com:yourname/pandas.git (fetch) origin git@github.com:yourname/pandas.git (push) - upstream git://github.com/pydata/pandas.git (fetch) - upstream git://github.com/pydata/pandas.git (push) + upstream git://github.com/pandas-dev/pandas.git (fetch) + upstream git://github.com/pandas-dev/pandas.git (push) Now your code is on GitHub, but it is not yet a part of the *pandas* project. For that to happen, a pull request needs to be submitted on GitHub. diff --git a/RELEASE.md b/RELEASE.md index 23c1817b7647c..a181412be2719 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,4 +3,4 @@ Release Notes The list of changes to pandas between each release can be found [here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full -details, see the commit logs at http://github.com/pydata/pandas. +details, see the commit logs at http://github.com/pandas-dev/pandas. diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index f5fa849464881..155deb5bdbd1f 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -21,7 +21,7 @@ "environment_type": "conda", // the base URL to show a commit for the project. - "show_commit_url": "https://github.com/pydata/pandas/commit/", + "show_commit_url": "https://github.com/pandas-dev/pandas/commit/", // The Pythons you'd like to test against. 
If not provided, defaults // to the current version of Python used to run `asv`. diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 44ee6223d5ee1..38718bc5ca19a 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -27,7 +27,7 @@ about it! Feel free to ask questions on the `mailing list `_ or on `Gitter -`_. +`_. Bug reports and enhancement requests ==================================== diff --git a/doc/source/html-styling.ipynb b/doc/source/html-styling.ipynb index e55712b2bb4f6..1a97378fd30b1 100644 --- a/doc/source/html-styling.ipynb +++ b/doc/source/html-styling.ipynb @@ -6,9 +6,9 @@ "source": [ "*New in version 0.17.1*\n", "\n", - "

*Provisional: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. We'd love to hear your [feedback](https://github.com/pydata/pandas/issues).*\n", + "*Provisional: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. We'd love to hear your [feedback](https://github.com/pandas-dev/pandas/issues).*
\n", "\n", - "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pydata/pandas/blob/master/doc/source/html-styling.ipynb).\n", + "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/html-styling.ipynb).\n", "\n", "You can apply **conditional formatting**, the visual styling of a DataFrame\n", "depending on the data within, by using the ``DataFrame.style`` property.\n", diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 1996ad75ea92a..67072ff9fb224 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -14,9 +14,9 @@ pandas: powerful Python data analysis toolkit **Binary Installers:** http://pypi.python.org/pypi/pandas -**Source Repository:** http://github.com/pydata/pandas +**Source Repository:** http://github.com/pandas-dev/pandas -**Issues & Ideas:** https://github.com/pydata/pandas/issues +**Issues & Ideas:** https://github.com/pandas-dev/pandas/issues **Q&A Support:** http://stackoverflow.com/questions/tagged/pandas diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst index e2c713ac8519a..019aa82fed1aa 100644 --- a/doc/source/remote_data.rst +++ b/doc/source/remote_data.rst @@ -13,7 +13,7 @@ DataReader The sub-package ``pandas.io.data`` is removed in favor of a separately installable `pandas-datareader package -`_. This will allow the data +`_. This will allow the data modules to be independently updated to your pandas installation. The API for ``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. (:issue:`8961`) diff --git a/doc/source/whatsnew/v0.14.0.txt b/doc/source/whatsnew/v0.14.0.txt index 181cd401c85d6..78f96e3c0e049 100644 --- a/doc/source/whatsnew/v0.14.0.txt +++ b/doc/source/whatsnew/v0.14.0.txt @@ -630,9 +630,9 @@ There are prior version deprecations that are taking effect as of 0.14.0. - Remove ``unique`` keyword from :meth:`HDFStore.select_column` (:issue:`3256`) - Remove ``inferTimeRule`` keyword from :func:`Timestamp.offset` (:issue:`391`) - Remove ``name`` keyword from :func:`get_data_yahoo` and - :func:`get_data_google` ( `commit b921d1a `__ ) + :func:`get_data_google` ( `commit b921d1a `__ ) - Remove ``offset`` keyword from :class:`DatetimeIndex` constructor - ( `commit 3136390 `__ ) + ( `commit 3136390 `__ ) - Remove ``time_rule`` from several rolling-moment statistical functions, such as :func:`rolling_sum` (:issue:`1042`) - Removed neg ``-`` boolean operations on numpy arrays in favor of inv ``~``, as this is going to diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index c25e0300a1050..17496c84b7181 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -36,7 +36,7 @@ Conditional HTML Formatting We'll be adding features an possibly making breaking changes in future releases. Feedback is welcome_. -.. _welcome: https://github.com/pydata/pandas/issues/11610 +.. _welcome: https://github.com/pandas-dev/pandas/issues/11610 We've added *experimental* support for conditional HTML formatting: the visual styling of a DataFrame based on the data. 
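[Editor's note — illustration only, not part of the patch series] The "conditional HTML formatting" mentioned in the v0.17.1 release notes above is exposed through the ``DataFrame.style`` property. A minimal sketch, assuming pandas >= 0.17.1:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(4, 3), columns=list('abc'))

    def color_negative(val):
        # element-wise style rule: negative numbers rendered in red
        return 'color: red' if val < 0 else 'color: black'

    styled = df.style.applymap(color_negative)
    html = styled.render()  # HTML string with the data-dependent styling embedded
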
diff --git a/doc/source/whatsnew/v0.4.x.txt b/doc/source/whatsnew/v0.4.x.txt index 4717b46a6bca8..237ea84425051 100644 --- a/doc/source/whatsnew/v0.4.x.txt +++ b/doc/source/whatsnew/v0.4.x.txt @@ -56,8 +56,8 @@ Performance Enhancements - Wrote fast time series merging / joining methods in Cython. Will be integrated later into DataFrame.join and related functions -.. _ENH1b: https://github.com/pydata/pandas/commit/1ba56251f0013ff7cd8834e9486cef2b10098371 -.. _ENHdc: https://github.com/pydata/pandas/commit/dca3c5c5a6a3769ee01465baca04cfdfa66a4f76 -.. _ENHed: https://github.com/pydata/pandas/commit/edd9f1945fc010a57fa0ae3b3444d1fffe592591 -.. _ENH56: https://github.com/pydata/pandas/commit/56e0c9ffafac79ce262b55a6a13e1b10a88fbe93 +.. _ENH1b: https://github.com/pandas-dev/pandas/commit/1ba56251f0013ff7cd8834e9486cef2b10098371 +.. _ENHdc: https://github.com/pandas-dev/pandas/commit/dca3c5c5a6a3769ee01465baca04cfdfa66a4f76 +.. _ENHed: https://github.com/pandas-dev/pandas/commit/edd9f1945fc010a57fa0ae3b3444d1fffe592591 +.. _ENH56: https://github.com/pandas-dev/pandas/commit/56e0c9ffafac79ce262b55a6a13e1b10a88fbe93 diff --git a/doc/source/whatsnew/v0.5.0.txt b/doc/source/whatsnew/v0.5.0.txt index 8b7e4721d136f..6fe6a02b08f70 100644 --- a/doc/source/whatsnew/v0.5.0.txt +++ b/doc/source/whatsnew/v0.5.0.txt @@ -39,5 +39,5 @@ Performance Enhancements - VBENCH Significantly sped up conversion of nested dict into DataFrame (:issue:`212`) - VBENCH Significantly speed up DataFrame ``__repr__`` and ``count`` on large mixed-type DataFrame objects -.. _ENH61: https://github.com/pydata/pandas/commit/6141961 -.. _ENH5c: https://github.com/pydata/pandas/commit/5ca6ff5d822ee4ddef1ec0d87b6d83d8b4bbd3eb +.. _ENH61: https://github.com/pandas-dev/pandas/commit/6141961 +.. _ENH5c: https://github.com/pandas-dev/pandas/commit/5ca6ff5d822ee4ddef1ec0d87b6d83d8b4bbd3eb diff --git a/pandas/io/data.py b/pandas/io/data.py index 09c7aef0cde1a..e76790a6ab98b 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -1,6 +1,6 @@ raise ImportError( "The pandas.io.data module is moved to a separate package " "(pandas-datareader). After installing the pandas-datareader package " - "(https://github.com/pandas-dev/pandas-datareader), you can change " + "(https://github.com/pydata/pandas-datareader), you can change " "the import ``from pandas.io import data, wb`` to " "``from pandas_datareader import data, wb``.") diff --git a/pandas/io/wb.py b/pandas/io/wb.py index 2183290c7e074..5dc4d9ce1adc4 100644 --- a/pandas/io/wb.py +++ b/pandas/io/wb.py @@ -1,6 +1,6 @@ raise ImportError( "The pandas.io.wb module is moved to a separate package " "(pandas-datareader). 
After installing the pandas-datareader package " - "(https://github.com/pandas-dev/pandas-datareader), you can change " + "(https://github.com/pydata/pandas-datareader), you can change " "the import ``from pandas.io import data, wb`` to " "``from pandas_datareader import data, wb``.") diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 81e721e610cc6..d4eaaa0b5cd16 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1665,7 +1665,7 @@ cdef inline object _get_zone(object tz): 'implicitly by passing a string like "dateutil/Europe' '/London" when you construct your pandas objects instead ' 'of passing a timezone object. See ' - 'https://github.com/pydata/pandas/pull/7362') + 'https://github.com/pandas-dev/pandas/pull/7362') return 'dateutil/' + tz._filename else: # tz is a pytz timezone or unknown. @@ -4041,7 +4041,7 @@ cdef inline object _tz_cache_key(object tz): 'passing a string like "dateutil/Europe/London" ' 'when you construct your pandas objects instead ' 'of passing a timezone object. See ' - 'https://github.com/pydata/pandas/pull/7362') + 'https://github.com/pandas-dev/pandas/pull/7362') return 'dateutil' + tz._filename else: return None From 2e276fb2fec6dd04d6abcf5e79c03853ee86cd24 Mon Sep 17 00:00:00 2001 From: Keshav Ramaswamy Date: Mon, 7 Nov 2016 15:52:08 -0500 Subject: [PATCH 048/183] DOC: add sep argument to read_clipboard signature (#14537) --- pandas/io/clipboard.py | 20 ++++++++++++-------- pandas/io/tests/test_clipboard.py | 2 ++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index 2109e1c5d6d4c..6f6f1366a6732 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -3,12 +3,16 @@ from pandas.compat import StringIO -def read_clipboard(**kwargs): # pragma: no cover - """ +def read_clipboard(sep='\s+', **kwargs): # pragma: no cover + r""" Read text from clipboard and pass to read_table. See read_table for the full argument list - If unspecified, `sep` defaults to '\s+' + Parameters + ---------- + sep : str, default '\s+'. + A string or regex delimiter. The default of '\s+' denotes + one or more whitespace characters. 
Returns ------- @@ -29,7 +33,7 @@ def read_clipboard(**kwargs): # pragma: no cover except: pass - # Excel copies into clipboard with \t seperation + # Excel copies into clipboard with \t separation # inspect no more then the 10 first lines, if they # all contain an equal number (>0) of tabs, infer # that this came from excel and set 'sep' accordingly @@ -43,12 +47,12 @@ def read_clipboard(**kwargs): # pragma: no cover counts = set([x.lstrip().count('\t') for x in lines]) if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: - kwargs['sep'] = '\t' + sep = '\t' - if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None: - kwargs['sep'] = '\s+' + if sep is None and kwargs.get('delim_whitespace') is None: + sep = '\s+' - return read_table(StringIO(text), **kwargs) + return read_table(StringIO(text), sep=sep, **kwargs) def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index a7da27a2f75dd..6c5ee6fcd22ba 100644 --- a/pandas/io/tests/test_clipboard.py +++ b/pandas/io/tests/test_clipboard.py @@ -71,6 +71,8 @@ def check_round_trip_frame(self, data_type, excel=None, sep=None): def test_round_trip_frame_sep(self): for dt in self.data_types: self.check_round_trip_frame(dt, sep=',') + self.check_round_trip_frame(dt, sep='\s+') + self.check_round_trip_frame(dt, sep='|') def test_round_trip_frame_string(self): for dt in self.data_types: From 06b35db1d88e75a9e3f183b58678589cf7715381 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 10 Nov 2016 00:07:18 -0800 Subject: [PATCH 049/183] DOC: Ordering during set operation on index (#3123) (#14629) --- doc/source/indexing.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 0a6691936d97d..1ea6662a4edb0 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1467,6 +1467,10 @@ with duplicates dropped. idx1.symmetric_difference(idx2) idx1 ^ idx2 +.. note:: + + The resulting index from a set operation will be sorted in ascending order. 
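To make the ordering note just added to ``indexing.rst`` concrete, here is a small illustration; the index values are made up for the example and do not come from the docs:

    # Set operations on Index objects return their result sorted ascending.
    import pandas as pd

    idx1 = pd.Index([5, 3, 1])
    idx2 = pd.Index([4, 2, 1])

    idx1 | idx2                      # Int64Index([1, 2, 3, 4, 5]) - union, sorted
    idx1 & idx2                      # Int64Index([1])             - intersection
    idx1.symmetric_difference(idx2)  # Int64Index([2, 3, 4, 5])    - sorted as well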
+ Missing values ~~~~~~~~~~~~~~ From 85a64644077bc231cf946b19765d4c28ad12f524 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 11 Nov 2016 03:53:49 -0500 Subject: [PATCH 050/183] TST: Create compressed salary testing data (#14587) --- pandas/io/tests/parser/common.py | 6 +++--- .../data/{salary.table.csv => salaries.csv} | 0 pandas/io/tests/parser/data/salaries.csv.bz2 | Bin 0 -> 283 bytes .../data/{salary.table.gz => salaries.csv.gz} | Bin pandas/io/tests/parser/data/salaries.csv.xz | Bin 0 -> 336 bytes pandas/io/tests/parser/data/salaries.csv.zip | Bin 0 -> 445 bytes pandas/io/tests/parser/test_network.py | 6 +++--- 7 files changed, 6 insertions(+), 6 deletions(-) rename pandas/io/tests/parser/data/{salary.table.csv => salaries.csv} (100%) create mode 100644 pandas/io/tests/parser/data/salaries.csv.bz2 rename pandas/io/tests/parser/data/{salary.table.gz => salaries.csv.gz} (100%) create mode 100644 pandas/io/tests/parser/data/salaries.csv.xz create mode 100644 pandas/io/tests/parser/data/salaries.csv.zip diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 3be02c55ea10a..f0fdc9398084f 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -630,10 +630,10 @@ def test_read_csv_parse_simple_list(self): def test_url(self): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/io/tests/parser/data/salary.table.csv') + 'pandas/io/tests/parser/data/salaries.csv') url_table = self.read_table(url) dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salary.table.csv') + localtable = os.path.join(dirpath, 'salaries.csv') local_table = self.read_table(localtable) tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing @@ -641,7 +641,7 @@ def test_url(self): @tm.slow def test_file(self): dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salary.table.csv') + localtable = os.path.join(dirpath, 'salaries.csv') local_table = self.read_table(localtable) try: diff --git a/pandas/io/tests/parser/data/salary.table.csv b/pandas/io/tests/parser/data/salaries.csv similarity index 100% rename from pandas/io/tests/parser/data/salary.table.csv rename to pandas/io/tests/parser/data/salaries.csv diff --git a/pandas/io/tests/parser/data/salaries.csv.bz2 b/pandas/io/tests/parser/data/salaries.csv.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..a68b4e62bf34a64118a776c3575a848d2dee3eea GIT binary patch literal 283 zcmV+$0p$KdT4*^jL0KkKSwin59RLA)-T(kFKmdQ>0s;s?FadjPCWfk?`V564U}|{@ zw3NibCX|?(r>UmYq#z#udx$b-45?apVahHUu7f_`FRN_siIg?f?>dqH$ITaOEQHr9du@O{PIlb?An)*U(TfOTX zbpsCAbm5!s!W(C(!5NmFQ?yeZP^@L8$($VyP+WvF%a>z(y78UnoV7}xCJwa<@d0TE z#Mku-jM^4(+(0?TNRg(UtccGpkwvS9M@c>Pbnrp>s&pj? 
z?{#o?>=k#NICS!%dd2YXQg%@sI>ulS#?Cxh1cpnuCht<_GBz9_iVNOp*glYq|CXA11Wb zPp-Z_P{&*cfI&}IE_@P6UJ!a+r?9`gmV_zs%8-StgMBYn%D83mo{#8c-x{fIE_I}t%QC)7UlykOvN~LyGyJ4luv^P5Df@1 zSoyPi3vu5SCKqWQha7lxYqoESQ@84CB2x0;fz$N16pb5^?-^cbb$+%$5YC*l#Q*?h iBLnR-8?NpE0jmP}1pok6{p&Ze#Ao{g000001X)`D)|(0d literal 0 HcmV?d00001 diff --git a/pandas/io/tests/parser/data/salaries.csv.zip b/pandas/io/tests/parser/data/salaries.csv.zip new file mode 100644 index 0000000000000000000000000000000000000000..294f65b36771d2cfba97a51f9fa6df494d511fc2 GIT binary patch literal 445 zcmWIWW@Zs#U|`^2NDD~z^sm%i&(Fxf@RNywfrmkcp*S%ou_!aOSg)kGEHs3Zfq8+m zbi|95G7+T}+zgB?FPIq^z=Utm?$RR$JgaMi?{NH?ETxdT%H(GKBoCFNr>pgN8kjV0 z*q?v@`D@Mo35VbQO|aK@Y4$kqp52U(eUC+HS55Vbl9JE$pIIJGQTn>4{=-c6)LcWu zhgx?kkDTDFikp_*E^qHtw&&N!c*Y4o-l^~SKJm4`!?Dij=duNb&s;qoolBlHce#Ad zyUyyk=~M633VwcfUX4Fe^W$0WxX#o4TIbX1qxb#(_HlAUt&!S&R+Ym0^9{J->!}4{?zXs86LxE@v*6z8>q6LW?aE4 o0Ss;i24E;NENKL>Py?A263A!)9N^8$22#righ4<$6r`O205eCa6aWAK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index 7e2f039853e2f..964c927c3c496 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -18,19 +18,19 @@ class TestUrlGz(tm.TestCase): def setUp(self): dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salary.table.csv') + localtable = os.path.join(dirpath, 'salaries.csv') self.local_table = read_table(localtable) @tm.network def test_url_gz(self): url = ('https://raw.github.com/pandas-dev/pandas/' - 'master/pandas/io/tests/parser/data/salary.table.gz') + 'master/pandas/io/tests/parser/data/salaries.csv.gz') url_table = read_table(url, compression="gzip", engine="python") tm.assert_frame_equal(url_table, self.local_table) @tm.network def test_url_gz_infer(self): - url = 'https://s3.amazonaws.com/pandas-test/salary.table.gz' + url = 'https://s3.amazonaws.com/pandas-test/salaries.csv.gz' url_table = read_table(url, compression="infer", engine="python") tm.assert_frame_equal(url_table, self.local_table) From 62b2ff3d31098eadd35dee706564726f6d163838 Mon Sep 17 00:00:00 2001 From: maxalbert Date: Fri, 11 Nov 2016 09:03:19 +0000 Subject: [PATCH 051/183] DOC/ERR: improve readability of error message (#14597) --- pandas/tools/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 86e2e8aabbee1..d2060185c3246 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -816,8 +816,8 @@ def _validate_specification(self): self.left_on = self.right_on = common_cols elif self.on is not None: if self.left_on is not None or self.right_on is not None: - raise MergeError('Can only pass on OR left_on and ' - 'right_on') + raise MergeError('Can only pass argument "on" OR "left_on" ' + 'and "right_on", not a combination of both.') self.left_on = self.right_on = self.on elif self.left_on is not None: n = len(self.left_on) From 6bac00828a48224ab0bcbce086ea20e7615e70e0 Mon Sep 17 00:00:00 2001 From: scls19fr Date: Fri, 11 Nov 2016 11:00:32 +0100 Subject: [PATCH 052/183] ENH: return an OrderedDict from read_excel with sheetname=None (#14571) --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/io/excel.py | 6 +++--- pandas/io/tests/data/test_multisheet.xls | Bin 29696 -> 24576 bytes pandas/io/tests/data/test_multisheet.xlsm | Bin 10852 -> 11148 bytes pandas/io/tests/data/test_multisheet.xlsx | Bin 10816 -> 11131 bytes pandas/io/tests/test_excel.py | 6 +++++- 6 files changed, 9 insertions(+), 5 deletions(-) diff --git 
a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7fa9991138fba..660300e1814e8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -30,7 +30,7 @@ New features Other enhancements ^^^^^^^^^^^^^^^^^^ - +- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) .. _whatsnew_0200.api_breaking: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 6662d106ad85d..d3171ceedfc03 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -21,7 +21,7 @@ from pandas.tseries.period import Period from pandas import json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, - string_types) + string_types, OrderedDict) from pandas.core import config from pandas.formats.printing import pprint_thing import pandas.compat as compat @@ -418,9 +418,9 @@ def _parse_cell(cell_contents, cell_typ): sheets = [sheetname] # handle same-type duplicates. - sheets = list(set(sheets)) + sheets = list(OrderedDict.fromkeys(sheets).keys()) - output = {} + output = OrderedDict() for asheetname in sheets: if verbose: diff --git a/pandas/io/tests/data/test_multisheet.xls b/pandas/io/tests/data/test_multisheet.xls index fa37723fcdefbd014df010d464223044c977273c..7b4b9759a1a94582d4e4986ed507f35fc935ae93 100644 GIT binary patch literal 24576 zcmeHP2V4}%((hdsSb_?Ih@!Y;6j(_jrzjx71Y*D(kgy1dpd`_gC#P^?4tOexiUAck z!87MGoax*%V?xaFJoQY^^Hlb$n%OWr8-VxT_xs*`zZ>Y;nVRl@RaaM6SNHVph4ZF& z*Zt7?9$_8bi4Lig7!W-ModegHsKbO17r4OtI*CL=6A6LKzmLC=2EKr-LTEf4NIXco zkg(15A@Ly@Kr)161j!gu6G%-VHG>2uKujSuhhzrH98wEN7LZy(vV_zMQfo+7klH|M z3&|Q%J4gaZHjr!~*+H_0=kYftzO;SiaB@p2b9`XC#VJ1VN(YWKp`{fAi=;$q>y!8M<-wK))QI6hkrvpMq3qc{rO< z=PD}SK{_`)DS;>758`+9g-;x#hH(R^eDkTZEp=`RXAapzmAOe#Jis&Eg8Y242(`l{ ze)^msu!3Z;f*7!lMDpL0qXS6*8GU4uk|r!k1tbjQ9Y+$tqEKpAuu>m#5LE_AUA=hq z=C}p=(vE@JGnLDvjtk~!T5k@&dUJHICkKc4YppHT#k-yy1ae3%L9NCR!=PRa!+J2d zFjm&bgfW z6dFM-F?A*Yoyl^sb*DL}EpY>_Yr+bk5UqJ%Q!S!Jh$v_{WL}>Ls`-=!P<}3BzZlFQ zRGkiJm1&Y5^`)!{nTqSvhd*>wO{g-V=E77XsD7c8YgC^|s<~!(ReCeZjk>kwIgnCNiE^Hi14VJ8*%=+lCg>DO8HMg_JfK+fT*zh& zjhSpQry5i$dl1H^%0yc?TZ#}6C9viRNf}rqz%a{q5HJgN z5Ga-COUg0sV{oB#0zxUXkwU1@0nFcDA%G95fDud~fGodU1_eQ>&KfEsrXlmPeIN%cDyFK?A-fc{JtUr2%J;2Ko*S^urqH zv^==7KmztU`H(G8)c(i_SrGlbgA!FeZ79?mKT+L9< zPwN?1JCyOaQuw;yizxaZkbyd6c3!4|Uao<@MJ*jySd{t7?8-oGez_kjHZ`>{vVi*1 z{Q_5LWcB*7-H+{Ldcaj0iq6s}O#0N)^=0%cq(33Ku%JWN7gvTT9Ey$$J~SOyi==b` zTtanNbQWK8MgC|)evsqisuC@Kd^{~bZC5yVp!^I@VetSzGmURfrlCOc!=HpEd(Oz= zyOA6vI%eR)3YIxFTj#)?bV%yZ6h;KDY5~mBr~>r#su3J;*`QGzu;Ps>fF{zY0%!}3 zDu71Pr~=q1jVgd<)u;mK9~xBvJA9)GU{7sS0dzHuDu8{sQ3cTXX)O@Q>JR7@bu{+} z9@SAq@n~`o=TcE#4xbHzU9|xqXoi|W(9bsj1Wi*j2s*b0fS|c*20@S101z}`%^*1J zHUI?8S~Cc)qSp%|4%6N`xD4L_5bdo4Zn+^K+FJ)-1T;Vr?X81v5*h%ay>;+aL<2yy zw+_C`P=LUV@>oLm9ZxUqt%EN^8UUibb?|LU13e%z5bdqw*f0?7t>e@%5bdpl@9=2FV!TvL z^dw4g81@}}AUSnTqEPH&*5HLN1@)=H3-kV&L)3%3dGkgNqAvxWK7G1`UY3f2Wki}8 z(s0EH*MJNdGyyD5;d+~)5-k9z0uYETR5!U$7}(*Vwq~WJr7A-C`}?a1g*3KMNMj2{ z(_})i!(JhXp*1dG7wFj70`8410P8u86Uj$j09zXyOG&Y+VM%+)rm!)1iKARVW@9H` zF>Q<;+xW^susJa)aYfrfqIM_$RYO1|{@OvhGz5e_+_2lVD}X2rL8UsX=8P|D6@AR# zKTEw5CE_9#bNO4U3(l|YI`8q7oTyqE#P?(N=k^`I_s zvZ!PpG8>af#Bvas3);SYyGn0I8oRe6josU6nyk08h3X_13e|(2L9I~g{32#%`$eR& zg`(Y!vR|ZWGNIU(=LMoDEKfyKsx$A-N(I=a%E{pmW(@~^P=Kd-E`xe!cU5Nhl~r3+ z5@)$2s84vmudGki0^Q{W)LKO$*9=~!!{pA>y~IJ_CJ2m;6{Tc&Ea69xB{=p<4-U^- z(v*0@>#-8p6C;38Nk9T&pFk|UX~SIt=t1$FxG~Y^iQ;RYe~{?nHL4awbinltY$Yq z_FO3^eNnvRjaWS=a}`dVR5=N?a>DK?<-`}o-?;ZiJ*O5boP1O{d1~c^L!^|Gfhhj| ztDEXMSs*9xdTg&9aPn@D?TNzTM&tL(x8b#*d^i5*Oj<%4Y?Yp`uA(bqV< 
z5;AgCwR%uH1t{1T8`Opc%Csm#UgUVF2ens#f+?{<9ax}D;~@lRmQ*01tY~A1A>`Eu zt6_t-XMr;91bWsHq8`*y0SX4f26bbBGEIVz;IyCAgE}cd!4}w{9xPC1>k~3@Yfp8c z&0&8NtiK0liPAC>)6+#0AmUPi<{C+wfDL3pfR#pV{7fh5a6Lg9aA5$8n$^V+5dNjE zh7X$~AkOQNnzDrdm9kNBT1#|D59$6?=oF$K?#2Dlefh5V4JR*osai8QfpkW-MB9BZ0oR{g5Laq^P zG=lnuq-3NaKD)l=kST4}LTllMM|#WnBmlyqy7KOb^L<(G!@0Vm z_tAH<-bdfbdLMm<{h|}1h{*j1~QY=?8E=uGBA(ppb;VlSx9~ z$t0og3b`#H41-sMk)rGjkwBC^F*z|7I!OEyeovM{I!R1OBq$*p+7|r)UQMJ-Iabu~gNkFZ}UU}1b`!S>G53yti=C&@vmU! z2-`&g3nM%mwmA!y9^o~>(lIcXg6*n+g)zPpO9^HySRCV}9fzS!ST!&teqhZcGBqO_ z1|pW@*J%ZHhy(*f-+_IPN4a{XzK6yy(<5zivO9GAZAdr#cF>S#X^HWaMWv<);zWX2 zdbnA_FH)A9+fve!41qBK_ut1ulGzk~FGB#kj?v<9RYWloG5dttfJoLm(bDxmY4q4g z-!c1y9=r-*K3l*aMiwv@X%<2KP2n#RbP)prh@fj`z?VqyDjZTAh>Kq?@dSsH2$2P9 zO_77SLEncgfv;NNdNYI;@GX*Bz8-0}#~kjikk4;*$s2wiAr-$8JQ0~2jd#Z zgXP3fULq+XMQ|@Eg0g@M!!GfPmk=+p80e52QxPQS-SP27WZAN11Tqk>A|Qa#dO<3p z641P$ED%PSG{tZUc~K$|GM0nuYWeVmvi<%@*C#;<(P_zvqI?M1)`cKRmjotHOo+~h z{yM5J5_qJCC?k3$-;%6kv<5!Gg9m8?e63{z>)AYbVgr9iIebX)H5XehxeEENGV-A; zpW*8g#!?)(@8y9fPdwr4+_>lrkpx}`@ku+V8|I7UTR?&f>0u%LDB(ZcKr z$=@$Nn{?dWB;-SW-OLNSucz&IF1>zk{ld}h=DyR>o02^Dx}8mUk2fFMb`>sd&dpEZ zT@oMqAl@>n-O0cg9*$4SE>#@7V0h-wlLLl6JO4}Htg68=C3<=h!=BkW9;&RW`uP`+ zu>;p{ihQ=_D?iJ^24nYJShvq@PS@CO9?i%3O?kS1+A)hP zor)(j{nqvhuB?7gmO7{0JI~$M$h9QCeC3Y)1ws1-w+6kPD;#+`ZNRz5Ws_}cT@7As zU-`6kWIL0OO+xRvy9V7q|F%W1(77K)kG(D)p0N7F$(mZ{=cmU#IPqHqsgTUBJLmoO z&ebSTg#@B(Wo4CK-TdSTJn;d2JAJ;MY1yp69uly$4Cocr2ESiCtV(6;f%in`Wr#hO-keZnrkw-|V>^VrITTh2&!9CsJ5i8wmF%JoqQFQxO+u*8Qk z>$Y@!FzC>tEu!)())#i|{dn~YR7JI-j_4O*!Jglzxo)=FaAM)JA(U9w_Uih0i_tUn(*gq0Bbzqlah)q|)1I7Mr z`1yraoy-s0z4o8?c+KN$V*BBpMmiL3G^(9Y;pq13#oop9tj_zcoRnoX_`CjJ2lwb< z{*&%(D}7#Ml7sM3)o|w-w(q0v-ta2AKKppe&~D~eYWtpid+5peS2ynLIJLa$R}ckB>}tT?ogaptJ78{Yx2E7*)A^r{b-LU(SZ?T;6KH;p0Pocz)(bvCzx%e24Alv%849X12NZdc?lz1|`$1 zu5HacRJr_9A@Oc~wAUbEdBmRDgvqyzD{?Qb)w{EL#O4uKf1e}y)#AxAm%2^|d8Ku$ z-KJGN2{Om)?8!#2e0R-xm5^=F=k;xi!$XoXi&uWTyd-x?!tQ+GzA@EN^L=K8S$1tB)VzB3v@#f-hr)A9+I=bFJ@OVVQ%OCba{JF89@bvw&U#%%I700w2;5l!0X0J`0 zmbE3_<}Lk@c-i?zQuA&j&Nw@!-`?rJ#%@^4@l{n(pa1ytgxB-;b2o2ZxMpqlKOen& z{ory>|ILTJkL#F+4?aHh&h*zG_N*N-z1ydk)rD2!KY~w4>MoD_@ynm3t@zIqetG)n zT7)FK?%ku#Yb76#o9P^Ev0yb+VX>U(BP@d9qj>*1ef&Y=`Z~*|O#jrkOW?3_WB!3n*KB5=Gk>*n)`#u2J4bqW z2-nPP^26=?PtE#d=)0|7WjW+T*6gWO7hbivS$D_fUFELN|ID1?cB14`jMs{ndmq(x zy|wSl8k0VsruBXuZ9Dm{fqBWKX07vA`%S(7+1IDp-OazJ z9q(DSU*E9kx2gBzsVj%2FFR5;z{e`b$ZvFB&*b7YUCxRo9Sb{Q7dWkUOxoKlUyt9- zUsPXpD!X+kuIqPBw_9+cKRMbjJ(tmZu+Hq8GJNacDP1{Pq}tF@X~wx2WNN- z_pZz;4GHd&!^@A@b?lEJE2G-Dy58Vks%>`C>FOfk{gM1lD@MA^B@23U*Jl5m=zWTJ zxX)#Sli;xy?&4s0Yo9))Vv2>2OvBIXLPp|Dge2eFB zEA$3eY?@Nt>#N_s-#^Z3_UZQP_f;0HEH5Ze9OAKc*5Fpx3m%=AbMRPFix}tSN%;|_ zUk^&Na@ksQ-Ra?AE_cnT^Zhf=4=^!LdVbD3X~yf_#zw!K8}a_;53TREx^ixiy?OPA zAt{~jFT2K%oqQpC-OtUkR_(A@=~$6dI&bH}!~Ih&um5gc&{O~LBcXFdT!(vgg3xU? 
zzn<;ZyMxuX;FoWc6OVKpZ(6wNOUc9CZGOD}Xa2z27Z%0kgF0NP9kS~}T5Zz4=ga!8 z&U$2HZeg7#>U^lnv!gAY_KSu6<33$==(G3FEm?^_9kVT5ZXeQn)R^^4U2ZtcIhdDt z^JId4Gef^@xskh@-k2y8@`TW~KHC>jC-8}ex z+p3Wrvqg70`i|*6Z^LcpE5&_EA6*&ozVhAO?JL3}_N3$c4CKng2 z-3?=!6%53OzuXDJg{Xl1-$BMM3PJIfCFU*Pjm|N+)Vd0R6wi`ADo<4A)*N4oUlGm#^ z(@xkQiZS`RnzUNhf2Cx{7vI*i$8~IHF}iL*VET=GUFY>D#dqyim51?u9=U)2Pj{eV zbWFnGy21N(O|dFuV;pKC*ZzH#0od`IqO=?6lM&6a3$o)OML`RQo0*NkRzDkw9h@kT zE~`VLLKQX!+XxgGRj@yfDUBAB=-*ctVkDdc>5|b<`*PgcPW>VW{mp$7rEl&VadFB> zAFv=oHU$uJI2eDm1HbET4CgqQXyLUfobeYt_~dih>r3G={r#gTdIyT`LY-$&lw4{G zLSs6>f+YN|2j?4jaulW~^zVJlq;ls`XGh2#PaTi_ofy~7a+Ih3?HSkW;Kd9F5IJz> zzyxhk%iO6jQH0~NzwEQ($zZ@XBstV{3w6*8jEzU#gGxBneBAmDl5t@f7Y?jBJq_e@ zVF8P}wuFpuaGDXM;SCSCGHIv}HVvn$Y#M5SO@rkU6&f}hGv61g)$o^75S6~;P`{%h zyW_$gibOxtp;`)mjRiGC@VlIF2*x}Nz;zo^XH2Yb>s#v(>t-Y{ko*IdZBT=#3Htz8 zJb-J&qYpA@FpQH``{V?i`NNRO|nI7miN_$>>pG#@}=sX-2k%g%~A^HEK*QZ%SSQsk+lkd%I` z5XqIQ2vr&@f{)QQXpwR$k-#pP;GkbQZyH>QHxE!`q9Hg>nK#_Z9}D5Jp0Ya725H_U z6mO_KlQ*tUD}yWXhW$eF#|W;g%o}dykKHNWR$rtIi1kKWr+FWtc!P&w@&-xerD@5P zcthjMAJgD+>!86Eygew<xO5zhfLfKm;2n29!0RUg>Kr zh-ySS+6PvP&clH_xQ5l7TK(?a=43Zki3H$DdYtB)lGD^8O#nrqF580OI*5X%hCW(H zrwF)J^T+X9l5vbB9n=UMI>E5v8Q%XCGagCtl@W){V&EhXItEhxdHw<~aoZS#JJ!f3}-Pdj+$v)8gj&6fDclqpOv1764~- zdzAA)vdlpPGCF9cFU3wK^C<*Mfg8kbkEO8VGQB>ZdMxkPGSGUwz6m9Y2CpALNh>Nx9es`%75(${q8ua>y}37dW~`2 z%7HTm@rurYu3Kw_P7;tax^8L!kj1?1pr@3m;4I2A6}LP6p0);nLwl@wkyL zTtkGOOYz2dl)+nr3zzY3qzji1K3r07gbtcT2TZwhhz3yIXs)sd9miBULdW)_+`KG8 zr%DD;XZobs%^N^Ts z|9={&g9U6IL$m_r11Wd&*S`4}kzi7>m`_}scHD3W+FnSwzJ|{OAYn+{3liGOFi6cI zWkAA>5i20!&t7bVgqs$&K|)(IfhBQk_{1`OMDg)yqAAopt|ty!p!_4FK%$=E>+SHw z*tFF2)c6cRcyw%HN=9mWf*>RtHV(nF07$u=ldfF~y1>Efi9s!^cfoJXqFrk{wAG(m z{Y!PR(8{E6In>|vVSqTeran-Yc+fI8rLgcGG4Wu3S_2czi+Yr9nxXnGwOa<}53ns6w#g7<$cAiC z0f*DSLXjeng}?R literal 29696 zcmeHQ2V4|Mv+rHfl0gtf6qbyFWHBE}#ej;+nJ^(NK@k)%%Yh1?dU}fBnZPV3=B#JN ze4KbHh>D8Y^Dyh3AL@zws%Ca}XJ-TOzW09bckj22Jv-e~{jcik>gq7Tt5?h)Y}ni4 zA>o8?h!*)O)+f5Gd>)*4VA5s;u8YN(WPaPjIb> z=@K5~X^}!WniFeEx+c`of;#S|Xpd9JuA~W+BV@;*ubv&Kt2{E4A4Hw~Kph>ZqX`^& z27%nO(2+>!GVXG}9I>d$$hw-VN4kLt) z9*&OQoh6A8ZKypVtEq?VSmH z8ea#}l>QG$E@&4}`d7@~ts;Mciu~0o^4F=z-=-pehl+ev@+?siG9>siB_W8);n4i*eUMFMVv;~82 z#N;InCJ) zit16#i6DW2Q&dP&wn_@3C#hm)W~Nvu3E1oiz|PZW<-sK?7%0ih5O9U+2()aW0+kE( z{{4Fep#lQ~6@+5vF+yRUL?*%r#m-X{3YLj1p+0A*P`5In*jo9|G{xW$3(k=wvx92#aeU# zGTLac77Ftpreli5WeT;}w{M?<(XjKFT3{YvQ-LuWcAnxK%L`~_wLm+NQ*DOeTTXxM z0tK|C?p2`{sCTR$+o_@6nLMUHD*7!4C?)gTkWg%721K>r!Wp&#u@(`0tsc+;W~eA?{RDh2!YwNy0HvbWew1YdXQ!f0VxYt57J5w1QN=1 zq8`MlAs}!g0LWC^39fJ&pcC~VuwJSkq%2%Lh^-t5B$Vq!J&1EdK+yZB(+RGk8=w>Q zAl3~5DGOH*VkZXz3FSIb58~1g5WG90PA9lRZh%hIgTOk!evqiV?o|VHq8z|*!B8xvpRO`M64w1pTcN5;g)+T-C$(%GosKvI$UR5iJPyLvuVaSeg(qr}p@)v#e(ny=sDt&)DjZ5w+o8zq+JuZ9iV(tQ24UR|uLHb%q&r;*ZeGng}O zwi4^994iTlDyUQr>L>#Ri{gUXb3oZvM98ZQALXD^HH z2qpy5>Vvg#L0fY`*%ks_se~v8#itM?ItAO{g0|&=vh4w$WlBA#9Mn|?3YNeH_2GcB zTc41LJHwQL3Wytwt|TNnHc!w|zU6r~Vl5H^+l{f(5m*N|MPP>z^@!)VjVSRXX7Er0 zZ2E^rC&f%oicU{VNzN0@mIvfpi46(;yiyFA3{NdZ!_+I0q`(vmS28&EA}DTWF${2f ziDIA&H>&U=?q~3Tn4J>HrS#=e`FvA&Yy)mzgeHh$#tTD5NlAHvOpaX5#0DT&0!#*D z$at6CzIO zS$a^ylT$)ZY@|>^o>x=i1v7BcecHsN=;Y}1l+@|MNYS+PJVBT|6u+I=jC3c{;E6qK z`( Q$UdEpz}x&XByZ8ZeB61kBu!cFMX&RY|n(Hq=2DzmdE9_5Nne#P+AJdRB=a~ z!2)IsmI*xm0xO7~qUhMf*8Rn> z@6R+E-W>Q3yT2Jskiaj6Cnk><#Y*~rAC551!7t&{h{@3VMbPAI51=KrMyW4iybo@1 z>w@BpNPCc<^b<`^PmKom!g0*Jmf)EC0Wj7y6>JI}3%#PD{3+ZlVd`55cT^+>4%gQ6bvL9G_D&-&$N}_!poPu|j_LmOWU;3q_i}D0x*XEcZ zlY~w&lF%te5;}zrvs0E1=*~%A1RWh{hzO{2s&wIxDnnE4jHViyn4SdM@{up4WeUv` z3D%Jg4Tt(tw00gsbfF+p($NAqdklK8N))XYG$*TBQqhKudM9KPJQ2Q$bqxzH0w^AA>73>XAQEzF 
z5TAU3C#6AO*afg$Paiz^;sI~Y#zv=$#4srkkREWm8|h*MRuJJt609V@e6m~+WVD=h z_Lr=oIX*MHqvw2grpryFZDa? zZPvZE=a;!p+x9Np7bI}5nR9XR#f`H{ep%&YReEdRmT&*A4KI%jDHd8Kc~tq;uI>Cd z^@N??!Lut1xAflqKE&PWg-zJVqWkUpSM40UV0Me>d9&oq$iv>$NJT48r`BdxBo>oJvSobF=$`4VQ`D!Q@dSqbPV0NM&XGZ~E!>w9_PF?-3Cac8}a=Do=|G51==fbz4=ou_ye z<%W7~${-KjG7}tZ?%2Gz)BmEb!7)Mh<~OxlyMEoZ@YBO(&xBU1x9OgJ6dHIb;o+37 zr!wc1=4{^LEQp#m@n4%}-Z?n>c*M0$T`v3B6!kph{bLi;`BnXo58M?Jd#%*0ym$9I z{$rgow=A7B`lj8=wbl;;!Ye*l4ZPWY%&sNHrQ-c(z0212Kb^VT<7s#8zA|v8IF~?IXe$l3vV2B zue5*TQ9bI2)3~k+!-o7(QQP`a_ZRLi2K)}ngz?+P7xH{(|gVy-FwDu#L)ZmAEMGxQzwhxucaqQDRNe z)hN@V3BuW<%1_1JSr(A<#W2G!@2U07yVV{YH#?6Xy7^k|y;*B{r3J6=Dp~nbHG`j5e0jkLiqji6DJj}w|HxFZER8Gwx*8V#@<{oKsz}l z+$A=9&4%=v*Pd%?-3l+vxis6nuWos}01wM4vm%!ZPCnrEEsW_FU zN0j6&j~bYnJgGL-{8em^$fUiMFF!x`+JDLDLRiyw@xEQlTt9TVzx`mE*dXY<%dR1B zDrX$LyNV$-S-EFu()NRb>X=@3q)E@E!o7}GeqDkciyx5zsVmLm*M*0= z9k>48dfvlvwg7iCvx`?OCU9CME9`-1uqZ z7~hNYN(+V=g?2MCw49T&@Wonz@q-UK8v{n@6`VenpH*@zYsj-&qg`{oR~e37^=Wu> z>k_RC3GHt}K6cmRWc&1_g3dvEm)^cNOZV5_NuG1fZ2VIOS{nMF z_pw@h{K8`ILpxr3+V>b?`ion1Vc+bu6&+f2+hK2=7rH5TaF3ZzbB=o?Y1>!!NHTfd zp;eCw+h0G5vCwa?z0vA@X!sA_qq?Wfb||=H|9!FT+YF}=o6^$`)-^xdHp{JP{Zm=Z z4?fQV3T*Uu4C(W^omsP)w@v?W@Ljq<7#6VZ@A*EhszhPI^Ea+C?kt>C9258`EMns? zM@qAezbh&0+-+^BZN%|*)l&rl1neCE3w0)Ou+iK6G z&*z4FUfNO-*`-61?eFr|Eje59ub$TqetzECW9sf({VFTo=VcwpuxN5S)@8xAoNgH% z-R5pyd)Yq0s>wJ}N>2BRGaK!zy#^gT%`bjBY<69~*0Ej7U(6iC@9{Pw$IbRqm!hqs z;)Xc?IU#OFl=lw?S(nf4;}2ey+-uO8?j*ovLtclBe7ds~`r2nx12gfyQ=bYIxJo?18GB1!p|=k2Mx69x1g$X?4B`m7N@j-)ii!-AGf|PP6ORMeO9KLxc1I3y!w6Qr>tEa zHZ2+a#9`A!yjka+PK-61SxKyyN30g_uj|laLA;lp)yS^{g43#U zblf&xD0|>ouq|Bs*QldM&)t_+oyTtrPtB;Bz5*F5D4EPg^pOYP&gW#My+NO>*&Vka zcYY)*Xblrbg$V~s{ahsOWJQK_Lm3h^b>U)gjX;KR2KIB<(x@?sh(0yhW@$q(fr*omVrtcU}$omUx*8u+cya;E^qxAOdSEd_&X- zjVZ}^{2X8i9ISnft zLYhLHOHpT2M;D-uqr&fjuB>x6Das##ty$+A;HCm1@<1jYtio{BcA2Pzkz zCWr9tlOXE1&WPCF)3fCf_!k^ZzJo0X)Ggw@4b&xeaE^GA5JYV;!G2vgAS!_(S~LkV zp@RTO;FaaC6bWBSf?AJ?G;3*|R zCeR{m8FW}Ow4^S}$$-XA%dno30U7}rfM`+~@SS)@24nz@K$FO10G?73WCAA4mO+;# zgOEayUcL!0(bs_1r@==bW{?7sP5qtWHOD50V)|6~N0-PW`dVmy-@pSrILH$;St=mO zLU6+i#pG>a@Xjv6HBC+7h1zeMQ>UfRm&*k_6-orc0y9dLwoP zrZ-{_08c5;NYQD7J7#)gBb+w6JE_w~f69m^1kq@x4VN(+LA9`Z@rVFM0{C8wq(WFq zCM}L1632k0XeTZ68wvx3q7R2*hb@xKhX+w2p%0gcgy3BNEg8EQ-dx`e&RQN^-gl$t zWvN5vyT?$xq4sRvKq+k{di0?rXLxgcHaKZ{a2aoOrx5B`ynz=pZcx0jf3xb%_1BUC zf#J>d)!-o6yt%%bHKd`hegH6xug)j4;O`8t3l0eoVR{V#JvHO2v7YF|aWK}UhC@7$ z3dMN?<=bIny^;Q0AHb1`KK$``{{#MfNf;q3`ux-VIefB46Z*Y8PnloyfXgcVc@`y-8h=iM%lTBcKSyvi{@fETtMupVDc)-Q zISnqyo9)jLT#Y~XhRYC?`|}ctw;F#=OD@No?avWhjX(FHM59XM3a(y%PL1xUSMcp{ zjs-Er_;ai^?a$$pUU&{qr%C)d4pBHep#sioRL;+FeS+N!cUrXSo1fEon&UrA17G2` zx-KmLxdBJ=2RZBBf9juLwtcC9xV!GJ@`k)U5OFUEuLnWIiC#~L*mH(LYyxo_M7-mV z&pN;Z*6;`;eb>Giev#7?C}a!2Sf;lqE-qCxol4`YVah%-2ieqh+;{4m7?YZkmJ*jP z>^?0 zM10yj2_ml5@ueetT7MQqe7-pwBJKw)f{6PA`ysOT18^?^1MWH@GGbv+^=k`EkQR5e zs5c*Ap9hQ)bVKvB2ju8Rhs4o`I?~7fJOrv4QMo= z(SSw+8VzVPpwWOv0~!rzG@#MI|62{Gwoe+s(6Z)=EcAi6{JfY=ryd{BV8?hU{A$r${y3%BKJuH6~aQ9qKWC;q<8v8AN?=13v|Yb;l3p#8EO~j_li6 y(t?~5H{g$b2>VY%`MV+pb{+^iq#JOh zopQ*U^((cLw-oY+%fw>!k(gTZ!=uG(#O+koAc^2i!+XTxlOt8=s=EQxj0!80ldp^j z<&TIeF8xl}_gf{3Z0OeZb%MXt8{5plaq}vXFp$2Oz+^d61xWq_I>^CujP2T{P z|E|cEWUzW@i0t=1<%Kiysl780(q42U^TsW*h-8UOX-+i9XFQz2QI+?hC;q(@k_jFS zkQu)wE7(pa4gdg?B*jNk^H=G;Zk$FShlj={jVmB<;zpi?oZx<6`_>#1#k2elwKIAK zg$TAN`qPOHJl!7P7n;{vH%AT|^}qO*;NUeaP=-G+Q_o+=rl7XN+A+qDyoNnqiJ~Sy zAw;?W3+AR>&~D*h)+JVZs|o2y)GGO9nMZ(@U_ie-6;-|=Cel4aDx2&d8;5P{=Kz&+ z{(5dQw2oP3t@P0gd26j&oNSCr^)dIG$Ev=sxgY89NFS!Op~O`|p_=E&q}?%psN zN1OCezBm!8WWYt}#&qid$Kpu#y(nV$fzr8QdMR=xo*bMO^z(Ebw*9&)I5Q9&HSNSW 
zc(-S*vFt>nVp&f8se05!yQh-?H2^b?_FiS7)|P7(YJJJ=AY@{dF&607 z#ZdgxDne5)s<%&?Zef1wcT-}(%xt%vgWsn)xP#Q8aR6MX)#wV@_@ ziCz$f3qQJIyJmn9x70UmS}V2JEtp&TMyfgm^oV6LB;5{uhmdi+{U)0b9QB-%s{ z$#CivxENZ)Sl;nuuA%4@$1N+mU4lInCf4B1AjlSx$2yp|TiI?{!&-|i)TH|p@rVS1 zydfGQ_H-AqLI=mV#Un@9!lMVKw9Wi!`+g1BVGip;KPb4gh*=vFr8WZ^hr^AW-xc`M z=T|=Vas?7Zc>OXaJ^;qfKmh%z zKb}Cs3+$pf6OH|z^L>64tj}vm3oF7{b*!XgJ(!fwiX|Br$Z+)}(N0nGlai1CenyHW zK|$1l(6a5;>NA8;Uzu(!a(`He?sw7^2F|Q14qkNbjh`&8Hx8xMuHck=#BPVxmD#1S zr#N89((utuHrcI{U!r>~D+t64MnM};`HyH-elp)v1iFBZ&H#}M@b?UhlFC9G{$08( zbTqay!?b)uFFs|~2}qHWZ`-+&7UG|hOqmyJoySm7zju^oKy2Zs!{_Ro6Kid{(H-jm zlGEfwMZAc+@8g|g0eik?#*k`I@+USNisEzpVKb_N{7a;TH3koxJ;PJ}aS=N3dqlb^ z4W!=HuTy_G+&#+1O*GEbFzzilD&#>re`e4IMa%OinXmTftVz3v0eFae52QkfrlKOq)BX0!U&?nwgwoV~p;P6R&2oY;D!DV(hl zcIiMovN9sl@`93Vo5R_lZ6>p(aeR?VM8X_>BsJ5O+FX0~z|jbQ&uBI}maUHO22scL zttbGvqA1vxt&vwbc`ruIDb%@QqdnwG;9jog3CAYi*iADmLhytQh8gJG)|y0fX!Ks% z*VdXo{JI<9KD>HYTAGlPD{{AcdA2<*D_Yvc(~kgD*6!C`ho0R)TXGGc!uQ9su-egI zQUgzq(>C3|?}|aFhqm8-?9Qt`c|Tt6^5;I?_TwXbA}Gv9r%1%5pY!lHUp1$SRYsPv ziH|(X)W?LE;=uq0Ifovy;qYu+(H77dksS7R;D?5#2kE}N;i>seY2sr3jEDFv1UDd_m3vEhulh@Xp_IqH=dQSeAXCx^!%gy(F*1_8VZ&ykS9x zh5@gRa0^#rKvY|RB%43H3kcW3`bD)B9Bl~&C1@G(Zzk?T8B0bF;$4%{9h^RMn9%Oe zi;0Zdi585g$qr_XDv98{uwZKFw+;;IYeQS6SI;vSRj}ree(wQu#eay}XO?;`Nv_t! zz3PQJHxbAy|4`XTGfY_;Bsv!#a>RWzL;`+`9WHw9@uF z2Up?VU;fOD+G_(1nCFT^9G?B!ck4lb4`-uPZ+Id&B6Cn~Nsn_o7nl4JfhTg+Z;?=? zME<*PhMrq$Pq{iDNdU%c<1kr=tQ^4+E7v}y0UB|C=&H%v3r_*gQ{d6;9DVspr=?H| z{TczB**zUNdCVP88zFkBLLE^)rgfqg7uNw2RB<+L>E&x^p(gYVk zw&K^NG`Z(Q462LiRCx+=4H5!bG4Peh+La|fvvRL~R<+xZM1Nr{Xsh{lPC|Q)Q zT{Z?)OQy|7N|`#UgArkQCXza-;|m9a97GJ|V)J%%h}AJn$P@qu);2x$ZrSNN_R)fp#KV{p;{`P;O&XA&h+$Ewwf_NHVriJ8`HGO1y$2S>REi!kJD6-W zU{?h@A9ZIL;dRR(l!@EW?hd6&0$cghF0k!|SBNv^s3Se>^08WlrZ%nzkJw#(VwP3f zp%tr|X3=IRo}qkoLCzmu_CpY1Rn|{IKeQS}_tfVr6P0wqAi!o$s+M;>^M;NRDKIMp zo<#^+?%lwV+T4->vfenn^H;qU&k4Y1xiN^caeqCC(GQO}%sr+ikbk;#5oNR32LMt- zP0TY`Z8yBuzairI#^o?IlqAVg-@hM(V0;5~;$DOubIpSGwr@Yz(>1%z3wFo!%l?_F zG)mN{tCC-KSG8K!Ny!mtalY?$KLz{5#6gj-ie-O`@HEC5dqLVzKYiB_E{d|jZH6iA zV9|zg*|V_PNfTf_D3GGM<4BDpnX@mg0T22oS`nEk<%v#0qn(dmZ0ul+2h0_-3*IcI zbsj5t#~y0iG*jRo2~?Pj8JWi6?N5vntzl6~tOqJm1; zaFBT|Fx4A<=_a1wqvaXmQA+CM95Y6-T z`|{)OE>r0H-g6Q)Q-?rXq%vK@N(|+RNQCO3IamoZ-$;4O>-ykbeNP89*V-m3Pu;gj zFH|~D5#c^QLB%Jv@AC2t9;z2Q%+8>vC|FUlhe6(1OM~kb$DJle{T*Ezv0f@?N0%88 z9-_*p$AAg-YOB>455tvrydEf4`BxtL#+g6%C2u0C$Y*UiuECXs1|#hRLL{l-WNw}I zfAqo?69JuFuMMmJ%iv_VVm!^)f^9znS~5$=3Lf?jiG=nNC2L>V<#(qrd?aWI$bVDm zEMwTp?6S~P&FLT=javZ{t35Bg1E&PSjFHH0F+7z=J}Cd3oWiP}BoiU^vW~On9>TVH z?{Q)<*Zu7_yMK7PqXv_F?UfasC|39Ze)cT!REw^zn{RcZuLZ$Roj zrZrO8nt9i8JS;?3|()+-qF_Dd7I>E0fd6^YLTAIB=jSnh-?nW{1g zE_j4UdJ{Q#o%@|$s`9sqVBX|zUWR74sU-{YX8Q-`PARWGJ=|gH9O06B194V>(kNcT zn3e_U%@D#oxxn%5+HI(FSG{|i28Sgos833HEA15wsDt_e9^or@I(C;#j(N7Y@30Q^ ziT}zWV4P%$9G*R0OP6u$1G2H=z(qz9p{ALgyjy|X21XZWo;RgIDx%`#2TUqqU>tWH zd{Wq>8^>w2%@@GxZJ%WB@1>VycUkjYxB26%_beOr^xK|E;>>th58(9ijyB%k zvtQ)~FpEb>+PF_91(bY^Nj2OT5|thx=X4+rMhD1KB};|gCUPa&vU1Ur>N_iB-3#X)n~B{Y1E zo8XmaaDL!@2Io8VsM!=!2}&B2cNb-Y1!H=h^w^8K3aSG+Ud-V~Q~AJZGN=NMZ9x^C zkMG^-Ls*rL!wjecHmg=`nH_W0Zsn8RFmcyhHFufsC85L2cM!We3@OuTP#K7Cw=xSS zqfoaEFx0X4Et`9KY4%@g!-k4?#e(6A4VEoZEYlLiAjQ+2lTa>oW32XS@A7KzK{up5 z-&Mojjv#2So#V~sioAfX2TC(=xq_tpa$OH&!4|raQ@1+Vv;XbW#Buia(3Om*Gtb*> z7vSQ{9LJ~XlLOM44|CH+7X@Qz9~V-&esh3yabP{;EwGMElGWICevGMTr^Nf|RLQ-<1&(xlM3LLz~L*2Bzhrb^Eb#W6nl=iE? 
zo}~(5Go%4x=4UbAEEldXQ+d?)_;}>W)5uDto{}G;ZD}U@!VD^B(j7wqg5nBE6d|Fe z=q(5_9=2_xexjkpgIego&|kmWrcIjIHI$vn%2UE&!Wi;eonU-X?GVbb2Mz5#&oXbY z_BKsYAjVZe2M2dAOd9am(ms;3;+;hX2=`T0!I?{9Fy-=;Ii`3Awp0_zX2QEubfWcP zH6I)GqygRN^w@>~dqf3k8O~WGD2Yh|g?VDdt6N~x1x4pBXuJb`v`1i|+Bt)uqhi4~ zT@73odE^mGL#E#(gXCHpWa&m?KtVAoYL@-rq9O*uYr!yrhvK~;I;D*)>mSXTi%wS| zt8Vt8i1PYpFzdG{ni6J~VA*?#=D|g8!e34sx6`l-)9YW+6Gtk2iD)~D#f~|`)zLk5 zCNcCJJyk*b&<*<(Ep0L|30Yi~b+apx%8DACb5vrB*5)GN8piO~h$KRbx~3xDB!=i$ zeocAFlm;_$1SR~rVsGoX*m}|B;O`>AQP~KV0MUCL<6OUE#SW2^m6qeX2iQ0qIX%}S zCCEx!3MQaB_gp&Hf7S5&P<*;6akik>1=5@BT-n`0Gd|7Rl#Ti{oUg+P3(p;1@Vv02 z--0e6hdQKuH0y8>2M!F&e;hk!@{U%lpt;oeiA(>%@mh?yO!y62#i0*DlYv2rZ^YGr zH#aimFwvdMvl&$_9)TdgYP^B%7YqHxEH{N-vDJudbk4O@CG|)LZcI`s1jumpt2DIN z*E#D9%5Pqa+|nkBk3nK1R@HzVb;P0lwJ)b(pVzAMTnQzV#EMnF@F&D2bA#LJK%iq~ z;p<&PLs==yr;;jJJ~B+F+yA8Xa0CRqV`}SIN@z$yZ-WLVM#HQmfQG zd0Un1)UA(+-8klL6}u?Aic{*)lPX8vU~aXxL0R1m8w?C*>&L;QSIW*PI4JNatwIqN z3^q!9WD6!c45Aq2Pjfl_CHGjmk%zLu{00%sUOd_qqOmLLz}qz@h1JXnk0+iqA2fc| zlOrx~SQ@I(PE>SY+vw;BZS=_Y!{Y_cA;!qYdw|!DC_x)acKh!OW|@PdJZ?0L;KDly zO2gRx@8ky*7#s))(;?l27Cp0f$j`(K)-!Cq9oXI6eH^UZ{xr5lTIm)CnP z#5<8l`$COX`8|`zY?0@b2^mMU_LEriev5w3T&=Qk%BhtT*C;h!KkRLbOUr)zE2 z(ow88N*aFoBqbZF)p14GMvXMhoUuA{&PgGk%>?L^-iG^yrLe5SQy^j zufaNiBUL?NM(A&4g2yzRYkeO`ccBdvC7xMQX))mvCCXPoJzGJUWxyaWfxmsLYGwJ( z)J3&*rhm=_|4#>`o$zUkD1;=IGYih45l;_<_% zbQLO4ris6)Ug6SoZ?Pnbi|3ihvOSZCuBY3HWaz&MYz%TG_0D%33EzfT`sl!MRQBu7 zPqfV59zv%WMRMTJb*f6LT@tp(F>{YptG23RhpNUO^gFPr`)%&WT0}GMxfgm@_0#qB z)a|_^pet3PEd2b^q%3_`)ak|zU_H7+<;2mN`Ap?{%=y`AgFq1B@`q95)&hWc>) zE0_b=v?5s+pd10=-%h8&*U9%>Igu1F2F#cd4k2`Fw{}#*ixDtG62sc4&0%Jd9Nh{G zx=kJqj=4jbf~o4nE7G+JULDNj+W|8WkDX&EOI2`%L^f{O|LXZ(=j%HdzySbujcAmV za8UoGNtI9zV8n^CKgyJ!t&qDDR^8_h#zbl9n1yFG&5?1a;Z53uG`FN(=}67dlirIL zPkSRzxjY6xNBaCw9M2U7qL2}N@}y|pKE_Q-wx8R4;Z9{9ey1QFYrtXqJ}Xzcz`9_{ zrM8l|>07UNd=VzTN37@E9Hs@0r`lV5fk^0EbDjMp!^ZO)9!U8bUa<@vaR!eFR_9{A30LfAs9Qnb^3iK_BR>VuF3D&E>(Vc!SWebjoo3v;T zf4|rsP=2enh8F{`6b8*CZK>t#5%*;yB7u;m47vEgYSg%>E~n`6b4N7u8qzGWo$+0nX%|Izy66q>2id+C4c=oW=o zwIdljMjUsUb}*0ud|URu)7C;bVx^A44?!iSUKwwXc$7{5b(utrE7QD;klqxkwtbp- zjH0+m=<$5Z6rUBXkdgZv4shxbg~0}^O}9q`dDTv~ijE(+XN?a$v@j4W4;Ahm6O<*X z37QmZq>pNKXHZ}PmErKKpBCa@ zqDcvXNc;-Km>s^ToiNRB!Q0F(%k0qSWdETO3(+t#3A@AilhUE37%`g;&$q;i1OVXv zZ%Vf?bFh%KHM4X2lhuu8lp*~bxJVlT`Chn7{0gu~^({oh5_(%4N>X0Fg|~G|cFv~w zojcL`*OiynJQfW?57?b|XPQ3;)qV`k8Zc92oGpZF=68#TlCL$zG-AlkC-lC&8=`dl;)JN<@V_?cF{&DLEL3wzd+Fi?BT1l9%>=8_As>{(P~s%Gta&F6qqo z%n*Z80iEl-5GMDw$qMF6oyx#{sZ3^emrat_^4_VZ)SKE86(8SgbIJYlpr$x$ustZ>eoXqm(`Gk6s|5;=8v5HxM#x1Q1x ztoaJ|UuxvaXg8*}Q#87olbso@N{fFMCTpKQ932(*e@0Cwg0rW5gM}DoJ(9YPBGHJy z?}`|P$Kj#VTY#zugwstAW=1rkdZRvCXJ?Mc67yy{5p6*PF25>74PF8bl@^Wi68`(O zWXlr@?2(QT>974Z3`GB$)e{~u#ae5%> z-#Pr*IWPV(d_J*I!MYh~NdL|~&tt@&sQyPfX<(0x)TIA6?Ef)HqJzC=qDE$6`19$1 E0RD$aF8}}l literal 10852 zcmeHN1zQ~1wr$)!c!Imr5Hz^E6Wj?7jk~)OG`PEaa1Abj;7-us5C{&hlY8%*nasRj z@NRuyRdx5Ny}GMwt$nr}MHwjQ7XVlQJOBV71>6RklZHY70En;v02TlqQd`8%*2&b? 
zNnh0+Wa_BP^ zqQq}tsgPb0W8@ab05uHj4);jQ+F7)E|qQoO1^w&Ly^Euz$qibC7-EJ;K& z$VD`5nG=nw6XFf5(a|CM*C!xIN34YVGy!HQNCM1Z5O?kQ2#lMZG|YRco%Gpto#`Zz zfu!S?bkYjAQKZ}Iq3^cM0@YTrnyC^A63pPwG3W;38sDEXjSWnU*R7JFn2OQPA&^>_N;2-5pJ{69H)UoRkWq*I&`0UFRboWI8zf58&$qXq-VWHgKxzymz}F9@aB)p zzgOU@dI}@I$d0M&AnKy59Lqw<`N$$kt#_>Kl4DRWF0<1ArKzvWetU)2DX?Ix+UXE61i zpP>MXf6;HF8VmU)ctjQ~Pb9E@^&L#D9hsSa9{;P`|Hoqe)2&y;%PI7+AcvevK7Mknm+G%IAg8%zbW;T}E39%{8B){CnRAdZneIEET=We^AdFk~XgU>p$kE7b zvC3=6n)K?HKuyJ*+oHxO%Ylc?UEjpI`@3j*7uKT}qkP7YG8sGO4RBIokTU1WSF3^j zd^FR&pA(_4eCTXAAdF&XASGI6b0S#Il6M23_2&kWq z$DdFL@xM|6oh2B`G%yW>_p`XDO>2@8Kcd=HBu+E~`#-;JPdH$p-*!*4ZFw$-_#~U~ zsuAEjLugRk89DK6)mB%5-N|*en1_PPaMS$V3>ZclA+2wS&QcVHmL!K6?yn!>u7f2$ zEY3%$xs32mmQiT??TKV3bb>BEm3F{l6@rMk$INNZpjwWFX#FMTSYH`SuHDz&=<4JU zx8WxrxqI|$eClh`?LO{_z)Aj~uekq2Da*ZKQeCjmg@6G7(7;>)Q|ixTs!&z3OJ_m% z$!>TyxaEUDc9MHhlE(gpPEJM5^tj$Q#^BS!7>8V$#j`hDRCFbCW)rS;cI;{A{mDdm zxnt!JmIih7ggHAJEd--1=@n^yw(F@`N+Q>!b5GJ#+>{yUlFs-3oip&qN6uYIbiuYL z^)=2>5>Q@1T3d^S;Fg?)^*A5VF)1h-M(-nH_dK0}p&Q1GjwKHX!?6{Mtfn*sr+C9_ za{)gM#T9bzx5ER`=8bMBI^)N+Ht%|8P=t>P`^qnmqTvLTD z7dhFKp=UHuajM&)e{iVX_H-Te7Sk$+eSFy@!^#{5a-_^QsOo{ck0fIp<&w{biNt*w z5PtArD)lVv%ESW!scK-ree|qKuJ6qENV2)@^KI$X@U7B8Zw4>|>*CE~w#WB;2CpO~ z9A`?>z+vTCcGOHxGArn&>$xU1;r% z?IKH{KJZlcD0naSH=}V}t*WljAzeH5f=`i_qUm4O6y4C6anQBII6V9|bLVSV z^*mpq)d{G?qZQ(!KO(jw@Vxqg7tFK2JGaOHTa5&8^d3X-GlKucL??4o8&l?AN7kRN z?MQPnl92O7C+?j9vZL#R)jMnEb68Jnv*jZy(!+x7l19_my0rKB!)>gjit)y|ifPGu zh#gmoOR$*xl#fU7xAP7sxAjY}0H>7&ZHV`L~ys-#R$g6WDQ?SEEZEgH+@- zrd`T$>8b(~FbES&mR#&m;R=fmLH_Zqc?>-^uKq2a7Wa?WOnHnLn|QpR1va%yj>Uz? znnq0d1K#yB*Wk-BaNNsf?DD4HHTY1mALfmD8}`*+ax-)C zl=@nD1d4Z~57br4^(b!?nw904lSd7~%y%#5{fZ-c0QTWWpM2uWz$88x!F-G{_N~0- z%eG3}$;ny*nzn|8lh+45Wb7z4<@Otdv*PGsv3IvUxe}23f z_}VLf`kOlnRJ9-msPod5ce{+N$-9P_S!{;Y8r_y%YyiJMq^}M^eeVhICv`UWDo{$Ii zil2gYcS6nUXvKhaOwPT>BnEBTcg`4()aX(WW1;E8N#qbfPnX29&$;$nyEZL+6pEX> z&)`NYaiOCS1G#><5!A;)yyLo=13!8PhVi~WcVHu0)`as4N>GfoaVQO|@N)K~pl0~E z37%o#u5$#Ye-T$t@B7IG(t+f~RjU58Ln8dbsQPe_Emm=4*es=pg8_x|ltpMidP;CK zNzl^S_!?vJXo<}npO*>QX0YIluuoX&=D?^qlcF^nwEpWnA7YMb`&fj}$3{{q-p4%p5er9< z?eSo7+z_#?qr0Qo3l0x}*2xy&uNQ?uSWarisuW>fKn2Fy_Ndfbex`kB!$Kg*eMs6M z1-HtQLU`-5{7ikBlhzC`ZEdF(Z*hSa)`!>g5iuY2k1ZdFItO zD z>=ycq*JL`}-xC{FJ+ZsCf?o?TsHORIZO?W{mwt0SD2zFsFZb@!`sQqPom&$|Q+2m! 
zmI9F>F(vkNF4kD>?H8URVT&uyz{GatUsw@Vj>a;(w2Q`q3Nh7!I|EzkQBIhsbv2Q* zebqXyOpn}JsE5ws5vaG%sU3J<OSVZffe}$o%UB{5g>&ChLR4 zsu!QusgHR{1Z=bjNqp18SYR|7`DN3JY&~7Nq09YJkJ*o#*E^^X5On7@wTS2%2-L&pmvvgesUmDL#+Mda-#6GGwO=M z$%g?rJ4Xj2{WA?jN@YF(ZyTz|$eZRp4z63hUVp%-tyD78@QAJ`o6&t%=_K)lOPRB^ zVavSmn~h%GIto-Sn&f)C9!5m{SOydKP6~e{&ypv6l{c_5_xp?d{5R`qjCZt^L4{7~ zhx$g>ndfpvxx`d{QQMK*+(vXE@qLwWeF&`sWN=6~4Tv!vP?Y%!yPktsnLY2>KD11D zu|uW2R=G>gr>y;2Ih}XYhMVfwm=cb!Bons^MH zB7_hjXYou{8pc!>K-h%OhL*AaveUCaG{5*|fBWJN(_D_m_*qW~O?OR!W-UJ}NGBS% zho8cFw+taXc%Ne(4{55YJBNwmNh&|2L+ksC=G+4m{^HC_TmaFhQ6#^wS&$9WNT6mc^9KPI zOwnMrQDo9w!?TwYz#Fz+16wXAx8g$K{XPWwv8{E&XMukd&Yt`v6((3XqW`yW*nSB| zUH7MOyuta1-$R3ijh|N&My|BM>>Z26QtR;|E0qS!m^HP<^JA49e81F}OUi&7kk>>D zu6{t<9+@kLs^q{Z{)H!zgSnJj@5n6jqfdKRr%;UQM_JH>437x2YF^vfR;a<~r-U&Y zHB^>FhY(<4S?Qs4{1oIUQMEY&JuNPbI*4Aqssj2`xO>UaYUf_`Dr7z8EV7C{0-S=1 zxcS&{YffRA6!ds4XqL_WiXLvr?Z`|0b*c~Kh&!>a&sOLM_&SkI6_#eY=J0S)4e4n zx7*0wM4B|g)VTaDeTSz4f3&j{xTt9WJ2_Q^jk)M|uaK{-s%P1YP)vqUu3aM6t&7KN zFOEV%m}y^~t9&kwIY-n}18gMF{U!|4+WS>8X#mzMhq3JRo7{@s)haBjQs^W;RfTn; zrl6fGs!mtl4}j5lY3E6HN{1mWI=zQg|ETV?(V?i?UPF?bIsYGkhKd?n4qqNGIi#@9 z3*3H&PzWv7y+Rwd-3*JBjICB{n_4rv-_hp3o@n`Y?FO`W3_N+VTh=sT!&=}?5_kC= zcs`w3f_7s0;cUy2&>q9>%1;RuIvI7IGeaVua20K`u54~!lO2%gv9h|wBB0pHj%nLg zdYSL=yU7@tpcjGgaoB~fMlnz>M>i$k5qPtjGd97>&Ti%J9h&6`aT-z&1;z`}>#nI2 zv^#oIIPzyOxd3yEzN;5a7@3%|&S*N(HYn>~3(z!Li&6Kip;})_c;T!hZ<9c|(K1g8 z1}d0*X3ar~W$ep$3}Hl@k^&hoSeD-~d>nu5uHnaK=9gi=vFXDaJ-f*;92IfGT)}Jd zfSWkS>iFWfyn=>&IMBXO1KVYDGXQ+UZMJZMo(WCtS)zYfzrk_61mfE`7*^WEevJ7P zku*JQVF`=#0qaUM@IACjz*~=~)RunivAkWTz-ny7{Qkopj1-5eG(tYn>ve%m-+$DR z3gS`%5?Du^|F@2QWm1WM(vjOz1BX`-1tXa%gnA`fLQ&~*)+?4CI)it-Qun8xHVJX) zX+_Ksdql(&E!Sx>plh9^0a!M9b3V?@_cjC{+4*yZL$>6X@jdh)`CZuXy#W zv?B@lA~q%RokJQM0z}65-jRxKY2zp$>5%g0p;VE^tEiJg$TZct^|6f16qPmTEN5w9 zlQov&d$}OMjXTDF4JtZBoEJ`D(IJDla~C4$G%lZ(*{AEznj3ePE{CN_E}-}-_C8HK z=?<+t4cNfYb6vlc>oi!2Lw$)Yg}=j&OP*8T@L6TDd9D&h)-VnOMDoR!C1Y|tc#E3t z3nElfb{DkWZBGa3myUuHnZ5SJRr&6Y92^M?sU21wXBomG`ZZV@?)Ze@d~CSl0$99;>B%b z`in_0nqEVTZ@EGbAm&)AlZ3FT5C^+4#0K<)TYfihzn(9*aH!L@^Y;m0N1aFX*uU_J z-W5ragdW7R3ek|bL`}Q9Nly;DVQw|3#%DPCkEmvKvs?htLoVc_Fk5*L>&j~J}B14SUyhJY|(U9Wu z?HEg>baOwp6Iq|Sl`Ri)Og$G#;qJ@94T?|adG2^qjVq+m`_8F*Rp#@JFMf()SCi>PrB+ViC$Q9TA0PFwnB^D9hjq5ycu@4qpW~8L}L$G%~fNGO6PN@7D&HP;%Z=T{D_ zzG${nHjd3$JEd)eCn*>{TU&BDzAsu*5gH4&EDaKfo(|9eT}^OS78P*VCT#e;fJV|s zr#knm!I`xZgr17kJyer!=&y^ne;3P#Bc7+Veo`=9uq}9nK2X48cs+sS)+Y=X{Z%ctzTWMRBt?E zOlr(=$eN2fN~Bmx0{Y3qCaNe+oUVerEwT)W2`G4#;61g?@CIhWLBQO&sh4N4gPW66 zut)e6Scj*EQ@N>U*~h3`^EGKrt~!6i&X$L8w_1veSq}YX7BcxmLaSb0RAr9(M}alk z`_1~to<;$kk8JxrJ}fNtHF@&&$i7{R?b4muZ)BW0%>6Iz{^XbP;8Zb(r zM`IVy=dF_;cF0aVQS%{vDD%d(#^Kj{8X;yAf?dR3IedGa5jZ{f6v*vgf$$w}Ldk0m z_Ni`SnBvP~?JZIfCIfu8oF|4Gp_^xdJQu@DKq2mgV!ccr_E_PB?ngO{r&6q^TZl?1iIZk@R zco>rssFHf&I=04_8C9$F+va@*24a+?;*91EvP{fmdMbLfZi;FsBilKzWa85) zEhB1o!Iion6~arV>ZS4c_o~`DY)`{Yeo*V#TbRgLV=OBWs~!}2k|yh+ecLTt7|%Ndgs};tuCM9}u&A$9Gvf|R=g_S6X37{m==<@l-7i2m9sYSv z(<$3zBM?-<$wR?yqt~~r)+#Qwk&t=_SLK@&Zzb(@5PF&!U4K7u$U+nQ?ul|j2dE$= z83;oeiEs`r>9M}|OnwV?o_}Yf9KvN>3Yd*z;NT1W&ulaVf&NQIFbn^^WhC_2F8^%e zLj6DuIf-6*M;IcgG*w9Gu}}k$YOAOEQmd$mDLNE8 z&a=sR{n3ECMtN|B;43?^Nrjc!CdX1S4&-oo-2=s@BsyIQ)3gbV_@J7^vz3^`jDHXyU==Wql(fSkA{t;jL3*@E(Y0k`3X!(8O4@n(V-}Whf8kIl=_GV zsImeLmG=0J2j;W>g&&ZdeBG?yf^%zhkBr8BSr;mF<0~6uU_U>p=_CEFgf##@X*;Br zE9m>0`w{>C&gbqPcVj!2fErNDN7ro#-#P7sZzx}PGTaRC+$AyyvkBM+(*?T#9gNll z2_0Lx9Ufm;sERy4)rZ>vH~V{Ec)M<5oPOb7BM8{T&cL0G>4L>A2qWzU>FpOXD+C#=XmojOGp$Y2JfH33 zvX_yicXE62#q~&Aq3MjX3Ugo*J%`IhD7w>#o zxoN&uUq(PJzTK|77gq0F^hwexNbNCjkU~!PYo>AHTzDi=^|)M~Ld!22s#U1<9i-T;0 
zz65yY<1?4u-i1&p=s@$tb|Ij{8VL_BKvO^uodsfwZ~qmbx_PM^ja>h(k1>U@_Bd9L z2s_%4AwTg_8nT{-vV*vr>Kl+D)$|pGwgf{tFL?RPzaJX}BqKOZ{pVG1|Iyrk&i}9` zPEqEs0DoPE^B=>X=Wg&<{%KLp?}mR}n($}C9k9Oy6X^d~sPH?^@BQdMkhs7hEg0v| z-Ra+re{UiFVcd)K%lP+(;_m>z*Khv-2*Lc{Z~t30_ji=vi&=l5bYcG^%CFMa?h48;A#P6nm4blEE1OP6G z0f4^+Y`>fTbx!@WxgXh|%>O;Z{%-xd6Z^wnj`G*<{)B^k-Ux0+0q%9h&;* GyZ-^HamEe+ diff --git a/pandas/io/tests/data/test_multisheet.xlsx b/pandas/io/tests/data/test_multisheet.xlsx index 5de07772b276aece2a8fef0fca7fc8018e1e4c2a..dc424a9963253a1b3bfc7432a50b061755aa2dd0 100644 GIT binary patch delta 8279 zcmZX31yCJZv-QDUb8vTe4esvl?(Q7);1(dj!GgOx1PvY_1cJM32qC!p&%Iy0`~CO5 zsi~S>)ob?D?&-C6cdtIJN1b>R`F?PA?p*ym6V0mFoyt%nSVuBZCq7N`W+8NYsH)e679UVA;uLX@4Gcwpxy zaGjEF_2G2{N{@@o>Mut&SjO0qt0UZTkz3)cAm{r-c0Jc21loqodN{?syTsOTpZnJQ z8n`t()jr78oA-AoDS{gTB7bgO+r4ftDX zTw8$2Vt!m>9~a=*mo(a2e-jzIs-|mNV~;2Vd|43`(sT^dN%|obqgq6Z-ox><9=V_G zIrfF*GA5fzkwyG{M^z=v`v-u8IUJ@Vq@1HZvQ_j<^+R&G`IHlfTyrmVAnnLmzMIi2 zmTbu@8OZNj!$oVDMl|&K=aV-F6j9>xxrBwIQ_scn1R^M*_mB1*-Jl zH21@hLWAQ|CKX^fFr!byPBDLO{Adk};a>fK*d05EfDc&~{o_JQJ=+-+5MI#Uus{kK z54`;G(aCp~zvA7{Tq9ottAg4dOV%fD>5G_4|d%$R=uqjnZdd_??YfagqrN z)tCGq-s}4Q76HVgV*`l7_1~=G(PBgGr^tvOeSh|R8z{&y=3VT1>MkdhpDHY9CAP=0 zP+?HW@O++jcQU>3eyD1v_VVqX3cN0Lv%|hfT(hP@-CmWOd=a}w0svlLVF3RViV$)V zcsM4ZK}w?LpYUu@;C_X#?3hu1JHutx#~Wwa3xjFE#5*V?NhQ0xijR3^ z#tsepYtrj$ChD@2Xm(b2aq)LTD0v`^WU-UUCyl<#>_b)$PenTyPqFX(H zUK25e67+JGds$t@l`cIStnK3 zz$#($1Jin<`YLiJo&G*68szRd>bScuIyVp)H|<6_;w{bm?#X_uI6$vootvIUFv20{ zi8-2rIVgaWh#7v>T-dyt{KBR5AV?5=Lt$kK_$y|>!^jN^_|O2r7z_Y_@kU^f6PvHA zhogn7t0ODO*(qMr*fpCS7?j`gYIdJ%hFW3|thf+wk(ipwM(cz}_pWn4n+I&PAb8mEh&P0RtER{iN4u>NMrxoaOMw^tpt@LV$GrxzvFv`>VQQTr9_ZOaUy5fP zrKy+o0koQFX|-xIs{mJiMO*4g{Igzr!w~${z_qn3iZZ7wW37#Sl>Jb>8{?yf(b{Tc zaBL@CrK0Dd`Y%7uUes09rYoE$CI$MOj{&9;LfWK+;_{qo(eF!l8Q1@i07q-xU$UeHWNq&q*rWS#7q<>FGu{NzCUIht2Cnm$8a z3a(?Q?0U7(P;^P)k`>)8Lm!C{YXZ>=utpWI3>WNIcN*5Q)T0Zw=>CB{CUSt^5)Bi3 zeTZ74g<{y@mcwo1)}to3%l>1RyaD9uvVim;|1P?+jN2Fyr8EZ`M?#HVJQVrU6;{9W za|TEG-We0~jf+P`yChmXo|;*Ly3-);*tke5K|3B5@Bn}zWE_PA{E>(U*h6(L8virz z=i)fEK93;{qy%N%xtf;c_q2RYJkcb7mb(v;cA8p%l!OHED_krQ0<4w;4eNfrK7APF zwdvL}*Qce}K^I+N;M}_6@MZVG_?SlJ57Q%Aup7bgIUsro`jLKFQdw{-ut%4fmdZ|Ml!kW%>q~Y6 zzZ5a)uDv^P@w+pk8H-Y_i#Q6(WM^r5*fu`ecbo$YV(l%rx)WVMQmVX|D6E9X0iFeB zY9Apc6sabqKtjWj7+&XJw&N=BcOq>paoFf=Sw8YF%R|p&VCj|&0`;CjoyMcl-f>ng z{7J^9Nsz#}pf~a2xj_dQsUVPOvDUl0F5?jb;3n*s(`2nuZCkhVT|=!7naPLW%rz)C zSU8J&{f&E&&h*0b1-A4hmnE?NKoa=Z803p`%Ktg;)DDECaK3@tqvPP6lNFUw5Rzio z8p+yWG7mOQ;Eh(o7iJ$Is++CW<~(o&jzw7iFV5C*8MJ*c}0KEG|tHvof$A5Z2X_2YL^L$5Ehw!O~}r38pa zc0YdYFRH$RUat1}@?Y-<-@$yrDK11NOTwgE@D8+Cx1fkuhL^ETj6ToSM}?N+MgfMn zh99wFaBp4H6ww+H9rbs;3y;VQ(Z##vu6rgoakKb!Z-D90M>TAhJv0n3dKbiiMao=M z*#W);74EoDcsc!El}%1xIk7-rkym1U1~_uNY5?82hp>L8gGOxdiDY~;Fj?wkt$@uD z%I)46dd^bu6a-0yyvYkP2-Fc@n3Z8E<_(+@PxPE+l#5TcEJ=`1pw(gSp~?*K>xky>bX6g#1@R$zjUT1I@^Nrwz&-Jo?;G}q=Y0%)mpgLeGwOCf?6@x;PlU*bOTf*%zH#PzCB5xdG3DPkvpr!CR9P3tX&92~59;b@H;k&hS(USI^4 zqHKJfA)i?P`l(Z#m6uJXtGdrmH{O0MLj^MyT$v~B`!9lP9nX1~3Xg&E=jN2YTS&kH zcMQVFTp|CRCw^Xxt#ZB5snDqGVYwAOj@^7r(kmG5==FeQT$M8UpZ-~Tp6Pv+>byij zC_=^&vQ9a90%O+h14=_w;(_ZX9^gwKevUKX@%#c^e;94wU&LEp_UI7q&J` z>`Ik7tbATQg&jP$hm~uj^k82R>sr0W=OO5def?096u8H#vSR@=Xc;#r3E)TRycw*dfKS;kL_7LrC@0(fgspO)z1TWI$5T~%*@NsGwT?`bPGwWJBEm5nTAXs z0EzO@`JnIi+#|3(3O^-U^T0%r3=re%=7Jnad%>bhfHjUZ&LxnwRH&RP{{o+0bvcuw zKtZlaf>&Lh`Rho=*>pGvOJk}jLj?xu#efBAU zxTF!*eBAabF_rSB6)CA3`34 z&KSF8K2P5uCpc8=x(OWn7j6d}ON?AP=9-N79wlr@t_q}^l{0H!lhhlsfvh*#_)X;F zi#O=!auiDU=M1jvX8kjdZOLIP6RpHU`4z(+U;j>fkX~pyPom#a9$yF+0U?<4?^H*} 
z@MAVVFu1@xn3AC0CuGirbnK?I)CSNleN{MWz${2yqPI4l-fv0-fSc&rJx{ba6B z;T3uN>7LSn!Xy;SYOTXh;4&-2Jk9sGG;Dp47{+qssQKYk>mmCZ$i=u9^B9k37OqUf zmUeGAMGCcbAk7l%L1dLUW1c$P(>^bYb$EL7X6Tsx^%o{tr9B$4nptLTHo`gb_m`x6 zkrlrLVAf>=6!gPu5p>V|zB5ut7Yze!7o_TWHnVSO$>D-?!l0Q2!IhvU_Vm`aECQRY zqlZA%d-1%Wcg(j2F}7Yp!zhE$u%lcPYW#&~E0;00%L4!)CD_Cwi^Xosck>4SZQ6!x)SF#r`=75KZVB6RIY$VQ{6}1RlbtXiTAKfDqUUdL?y=DRgR&fhahfRZG!NRoLdpNYo=u_1Vr8plGI(Lhae1wuKPky@1ARmGjBdx`x##%2UxWwIK_TGA7=!$~K|K(0zR$CpGu_778C-4>&B9?lV}Z&#z$dY3+xC z0)wa8r7rVx@EHPHjO<^1?6%Yhre`t>DfO1;*4JSBYjP8>m^^hgrVZpXju!c|j2-918vwf_Q~ zj#f=(_*=3bMuE%bX<4X8L1EG0e*9GJYx~09H2TjtZ9#=*)vhvz?M!Y%<*`r7f2L>9YNts=h<$AmY`8|yZIiuE4HkNT+~*FC&UV$I zl5V`WrWNI^V`I8}Xu`g`pALAq$_>QjSVP&0f6_Rkdvh`P=?A?Do)-xk1s?y7CWL2| zN%M^CKm439CwTUL`P{r$|f(Gjv8tCkcY!dXYcO>Wl2Vj zuhhG6*;#B#SsDE!7%x#D=sGNv2tw4pRd#4p<>;k@OPJE%Eg#S@^+DW{-@zF-wpcOX z$6gl*H5;wo)Z~;z>Qg9v4PXbQl%)Q6Z+uXrh7)5(px$R%H>0s&_dW>2NagEn=O6XF z3_zn?vsSmA+U&Q(J%4>Saa@958v>FNj!x_E!zMKu+xKFB!{lgV2`>bi8|#Gt@7$4* z*^i^lz@u|H?;-c!TA)o24k)aN{HA!>*HI?&r`$eA`SrXrlz$}vTpWJWUhr9MP zdUa^9TOkq*NGb1RyoUgF5I;e~eCNtU?~%!~0OyMPkLtYgT{{I$60MLzvt?@OGVFYU zH&z_F%u2!4G`E-cERx$o>ES2-Ze%7(JtrMFGPH&* zv`@b+&=CSC&?XhWO`ka+YT@iv3qGnvlaz)m!w3r)>Tj)%JXH4JOx7XzPb-fW< zfvZ(Sl~Au4sU#3oScSo*ed|bIba@~N-czMpRwWq&{>pm^amRuH1 zAbnm+=X_=-(8YlCO}0V0vJrOEN6z3wP|gGTPsGYyL|;l^^Wg|Ld?6zznXRdkq0O%+s!6f0Kz|q%ig_)v@*V z5AcbXuYpr?rpwlze+?eK!A9J+dW60izq`Cm7|FOBY-Fy1*$!)RFb}X?Y?X`Dm#ID; zczHSY;cjN3P){oi)3!1f#WDxWne@hy5r8oTB}(8B)AW`E8IC%35x-E;V1q4nAjm@R zcWF|l_6=p{atf3%7*R%i*QXe;YMsK_55VF57daM9HXzd!1wu>}Wa`k~rD+3hJDL}w zcI@-$AmM?U8Yl}%6vlk+3gVR*!WAaDZQwk>#30 zgpicNUtAzoy1qkgx}@mZ15R|Ji}el;R=c1Va8@k(p{s$(EDt|sWyttUG)$_sMUrVG z1{4sZpkzJ_Eh(YLy%C7Oc`7{!p;g+-vH8`Sz3g%ww(jW|4lA#J4zck-(3CK@qLzJ> zXdPY#;oiAy-OoZUO>gdEr;b$yl92WkOP%wCYh(KwOycO+`)WcC!CQ`LTG}MkL?j6{ zHm&aXDr;&`t}#h%THDLG8z`gSqf&4!8(K^i+MC~vi2j( zK|e%O$K=9T0Yo2ljPnCd6uU%D*V;}Vo*0%Q~Sa{*+k{ipOZU?*sAMTX)*}Th195^&8|9Rqq z5frOfMRld|2b1oT^Nko`g|Hb?)sY`gi-AFzf7JC5hzs7~D9MY{rxj5x5eBEQX0nO( zj+yRqo{LPc)Ot)dHt$BNnsTfQGcKjv0Z4y+S03K)@0xQC=Ci1WZ|e|!heBj4R?~zY zbIh*&eIT#tkjJ|FLJ1+9$eKmJ_>Y5I_7<1*Z~m^erSA_-O%>(LU&?A^c}Y-RQa@WI zG*$OIhfStdYIJFp)vz&TU#Chhre1$53-3>8O0QA(;b~WHP`5cD^kiSOQ|uw{DNSoY zPN^O#~1MfE&71pz-ykEI9{E+xmPmei4kPJk@y_neGj`8s^n%J@3 zrJzgl<@dE#xT5(pCs zA$aqxSBDw=)~oBk5agLkqQO#QQBG#`o-gs4F(F}()qWLgJxo!4-1LQ)s9}{ zygWGV5ftnekPFTG)M!+VDvL)Uqm3$7_)U(NaHQamC9BGQ;Q zp(!i<9I^}MnhFzLJGM@i%+0CC=8skECEK7YNQ($qRF`rukxedU?5Z7oq1>bvdDxa6 z(-oVjLsH8>>r51Bw0#fN0UWF92{Sw^kq%mzzr(Yz!U=%J{HCck!cmOuHf1JC zO2ls>P8E>JdWx|uDEU!6M~OG-q)%wwL9>4==bAtALaSi~N`25bpjTkM&$l`kChf9i z$EzF%$1H`^in|~u1?hY%`pEGa`Vp)~HUC|rbCq$Bz04#6f)t5r^Kns-6dbo;scP1( z2gdk-ejW2_;maL>Qtd8Sq)e4~Tf4@o>D6Y1pODBsm1B1<5!*<+7fnBS8{8b?P7Eq^ z9*f*{u=3M^Vy_<5Uz}>2zdxE`5XpnS(5We_b^EwGiJE_`TC-CdKTwryR6> zoM;oxdgNN_U)Rsn*Hd={MS<69M49>cq=}gat|>E(o2d0@kCclU+>8&P-99uD?y zUPjiBH}HRXaU1GG@vWf_q0@-uSQ6y%3qQM@h2Eq-a^^*oLFiHA-b5H1&rU?#%Q0&D z6#9*E+oS9fIodS{c$YMinj-E2VFseAlc-4BE^z&OuFxKs1$*KeM_#UiDJZga&-S0( zZt1q<#sCTcuy1}MBZmS9rc5gg{v{N_Z5HnJC^zk_NvwuKjALNT-wkzT=C};;vdFg_ zQQZ^wWZHMlqYsVcyo!>&g5_HHfgQI{D^{g(SZIhBVzC-F4tT{Q$KUfQCiUv5Cl%p1 z9Qq2RJ^Zj@<6<=i!&#Z8cwu(mr*JB1ajX~`a>OSrN%kvm5reyqJ!4OWhN+_mG=cm zb`zI1u6DLWGM(5Lyh-NF@do$H@BDȕ$NIk?gQXp(_Sc3Z7;qa-DjQM3ca`T@N; zL|s1RDqKl?$~sN=4}sng)YP^JA%u6^o3tax4p(kTc__mVgmn81MjI}=^q(;O_aU*W zHiiPWsqEN2Ed&!cs|C*m{IvJ^{MQWE^6fO9xF%ejL|CdJC1XJ^?^9g=*Y$kUM#jQ6_!e() zXLJG2!5Z*mc9z0XYYk)pFe-74%Gd{lEDkZG}IDk)C%9S&kJSK@JQ78x2)DIq8vDkg&A_b?^eqjtU5fa7+h1aVh|%DIjpmdc2H7#;wt@BFj2oiqtIZ6%i%fXo+h^kzM@QkM}Ss8?>~! 
zc}%#-*XnJorYI2;>2h%Y`|pjI=G2{1yLyye z|FN2eoC+l^fg+qPNxTnf0!W8gA<!P3YkxEx}R< zR&e}QCs#qUHM^Up(bJmh%3xhy`lmQm`|RoXxOng@VkSP6Be@wGY=q5N`X+)z^SeWL z*a&QPZ=L=kL_HvsZe}PGtPzEo`gnuA1uS!%*=!Qhk_c2`O_&2x3bMjLN&HW!!~JhSC@ln^krJMW I{;${mAJI!bo&W#< delta 8036 zcmZu$Wl$d5vi)%P-~@N)gCM~jf(8o~oZuc@1}C_DNP@e&y9alIySpV293DCM)j99p z`}U8iUAx!RRCV{--P7w{qf-q>Sq=vF6$l=L2m*m9K>Z#`G0s&u^w7XZxgPdc!Dn#K z;)7+i3$$Mbnm@|7{qXyR(7XMaMGg`J89iC^O!g*M;ps#6n4X~C^Um;U>$zy29Yhw; znB6YTxYC4{(pLZGKDKH$ejq5vj+hfz5ARqHWwJGSHDh1z;ESYAJfcPD@3~H5id5G* z@|K-TMN>aqq{C+{Ed~YPKoHu$I&G|_^+mEU=bAUaK@8G7j1ed+@|4>YrIMpv3XS|Q z7ULPQf ziB~%4NHEoi0$`+pHlK%~r4@n5!vTuln_Q*g*su7h>Rro%lkOax5g2J56QvwVH+r#L zX0B#_f0KUaPVdqd%~;Y{m?by3MkO_UAyI`n&ia7}2`7&v7*FtFp3_`;pMuW1ir$*%`yYng(6r*uWk5`J0oE8YTW1 z+k-=fk07VTb4sjddPni?)xgYV6_P{d8VB)5?VSMs%Cf_^}3X@5F~z2yn>EwC$d3+w@!l^<5tFx()>V zAT}sy4j%;G$|{Qeo=mA87iIJ6@j5EESUp89 z>^G%tPCw7tD0`dGY#bmEP$gCp0dFQmliA1`xcKSP@XFRvHq==jSKbniP|z=L?skdi zKs04V>gcs7$dpr?Z;m?M%pLAcjaf5P5M3Vk8>dDtopEk>O+zNo2RXLON2iKUVw0p4 z92H#?rVPByTx)Zgx)??x;E+oj8Y2G8nTI#_Z#Y}LIX9YZF^vGKT>Ap___$=c2Aw># zB^mj9f?gP~qQ1zSKe3JT!5HSyk!vZXITk&_2z^(Z z@5gz_-E;AW5J8}Vt6BVXV_1)%GuzyBSoHy%UK!bDTu$jp#ralg76WfuAMmZ_ve#zo zJWaKob(I3(Pmh~fGRrlNJ#qy+Espff$K9CxLl`gR?{720wk>q#qM^}$C)JPj16CU?|%ng4I2ynaXXkJ&A&z~_+#1qz;_3T0or}azLme>xUrvVm+iF7KL7*r| zD9URfoS5fTJN}(8s*~%3)hBDV3wTd!v!!ENilf}^!aCFF>f~6$!4?h*W*? zY9nE$$z$S+iNNHVF79gg0~wPo%^~ z>W55){6F=wRT3($F9hasVB4oC4sX4B9kB8>(JJAcauk-VE}baH*Ea5NF<36aQ@Jf| zT&o54DTu7ZQd}!Z`BjtRhW#Pxq3bB5Q$P5`YEQa(ja>wKU4xF4d*KAkTX3)T5*yfn z#7T|S&@eRbIv<#;r0Ze+2n;LBk0Gl6iA8hvueE&~z-;Y4z60<4Cpp_iQiOXwJa_?wI zoK}7Yfsk@#VtTQ{3DYyNOdP(@^T~wP$mL^EucFdKh&$zJ5UL%X>puxN=Lz`Nvrl=* z>2(HHypa_`C(ARTT|5mjn3*AG%kX~wxq!2`Q`1j}k)AGabFjV913?h}1k^`F@zZ}h ze68$Jzs?=`P|+*@nh2JxNY1fMFXC9H@6=$o}N~yP+B3` zFt3HEId;d?y^fa+I7SrQyG)`mrhr*v0t%x`5v=+8uczSyAia;|mOai@2dz2`h!JRR z?mqqN&188_qRdqKp++!`KtI7r^^ZeeCP(Jco*s8_9Y*??^Dt5}az&#>=2^T!}{EU?{pcW&ZGWxOiYt(U754+gK$j~?H*r-Qe zGO;kqy|*~rH>8|vnC=)3B7=SQYn1bd*9&6yIG@!E)TqCBft6UQS|h&Q3bF3P8s>q? 
z?}L;3srghFl|q}J6@ec$#hJ9;t5{|uhA1U zT7;>Q(V1>iQD#3Pjv9pOs}oN$j4LQLaT=3oJQZu%gkCT#Spv`O(bx05OH~>rj;_g} z2-4Z+6*na=^EO{lUvq_hoF8gkoKu{!b(a-<*t{Bkc?)2oGl% zm7a1XPM1efX$)YMtNZY!R#Z%m2NIYVj7791z<-qk`4!<@f>wU~XU zx`d3Om}F=XD@~HVg1-5ynLSN5)7wMa9hei=3{Kw9D?tdVS_aR~GlDH@a^n`+T0i8) z)lFtk@OtJSt%{su3TGJ|q-MN>{4?J(!E+YM_8ozl-|*YL zDN5V&7x^Vo*=1KkWO&tkq~Wvg`qs+SPqUm!eX~S~>~0Rp#U7zI2Vt+%iLnux#>-O% zi}|h^vMtZOj@!ia^9`jg->>-E6;Irbt)Mr;%<9R2PsjF5+q$`gDarxMar+$o8#Q}r_ zj(iq-`k6Qr?GGqR8V-E`axr|i5lv~X7@k^B(eiw8TmTzIaW^*2blHu_Nuh-A6}*+g(%{gbEY;t?G_@Q)EGU5@C&<0_ zFmMU(m(uBvh?N&UEanw_Y;)belkXi_EGy8Wt+;#nKasT4QH5a*U#kdfuK@R4pO-IY zE{7hE7Z~8lT58t-2VCWY#^yKD3J>&gBeZZ zq=J19J=0``bV7u+0_dji2>IF+Q#;(XZyD z2kJ)RcL`Bj?-nD42JLgN5ui-gcV@D3Kgs3UIh|Ejpe0Dl*lZ|VxO}dn#V0rV85*n= z&GuE89a|!Za~PE(%kZ3J41B}cZD7j_<5rL-w%>zS!+O);%VSlxq$*`TGT5A%S1b!VS_SlP~Wx8hC!kx~pXJV2Wv z9ARs^xA@-eHhedpK3U{jOwN|R<5R95#(5G##1x2&iZ;x~T;i8k@Fs`adBy?^t09bQ zhxm2#!pZ8Z;}Co7zlhG{;em*%ZbR~$S-;<)+R{o} zZs4t#0?L<(c|Ko57^EiaZqaqyPUeL&mS(Hf4ecKve__mRo@xUfyZ)_heNUcTmX-Cm z@D>E)WF0<-o=@kN_B+u+2)4xu7>}WLC1=D+?JT;_X~E%7_{uivS2nkAD1k$AJq`}H zXe2aS`4Ju4GB5KTAvZZA6U=;jLOd?ft6?m(%i#@~Pek4v<}CH_@-tgGdq-xOqCAFl z0|9ZOOdnS@h+3UIshxyUSzW+c`M)&s$Bax&Ieut;W~f!szZRyivzDamT1B_MlJ>$| zPTVGkabsW`7YR@@nc&Doiw0PFa-4!$FeYW~jpr>(ZkQWKA3H0BaM^_9xNdBEaE8xs zQVoa2KVvTw)O#R|UEs7$`z|eGpdR(L&R4>B*xdAi1^CS7e`2P=l6e;DAJwdLUoYAV zt{)D*-@tv0`W}`rHE3Z8kM|YlN+KW@R@MK5N5r?LUfhxFUDkjKTx6`A-lHz8B**e( zVnK=PHQ^25FOH}pFG5gWIKuN+_2Brcdi)=vxGmOldj(RnP^v*`lwm;fin8NUb1Iq(@}$Mu5H%`VNFB~BKm2k7yp4%; zGvA8SlwB-)buG7|2m{FV@q!o7#)csA(Y;R;5?eZWN+^IXg-|wHIYpeR1_hK{eYIN; z`_Pa4;#%FMbZuP9x*|d^7bJvHr?}0){3GN!F$lXZCDfg}C{ep{$&}naV{iKGsPnrL zc>2U#>P^YmWT}KZjFMz)$xYeGB|m|7%Y48 zX@gtBqp1E%#Cup0v8j0tTj^MlsI;rpq+ISU1rLM-%nw z`Rnu|Bv{n7!?Uxkc#`dt39s~#%0vDV+1FBJC^QbJ@u}vFHtMehF-~s32hwEn7+CzE zeImmKcKuSb%$!%^a;;d+uPWeV8ReW9z9C`LqY|tT zO1HmW0Dnk(Muz@ZHy-kM}(rjhj8riCX)0mdyhc7dq$p0 zbG@sq#f5kO%1qh;IN0>JIF_eY_$A0~q;p<*KvR9tI+P9H>2N1#S|iV_cezs&!bbbT zFOmerQ^9OS6U3~i`AJH%m|nw5ZMnksA!k}@kOyY3M3PR0Hv7Yu0Z1rs))xU`ONLApyZZ>GdW^dt zZ-hhAQ1@UJ_jdSV*ZopuB%=fdoBi$!?530e-`*CMc2^>Vs`@?`^x7q-`Gd-(1L@^= z=l8ZPdaA|Q2ZRlR2p8n_&P#j~w&R9oJ+`L6$A_o*_3M@2BqAu+$@iJ{{}*8AVSA-K z4apH@dcdi|Eoyqdsf)&J-*@>hhHyJZ6g5e}jbyJB`(nI8&FRG;?XqGmP&>wRlsXZ1 z@BZ+IArSA^8&KI0eNrO8=B0jMJ_%Vocl~r7lB1BL%+!7vJS`T>p5%JJF~BbI5v@On zQ@B6lhoMkQtGB)PKa2XcwU^G%-}V+UD!q_feh@VTA4*D1qxh3!(#AMOP($y4CZ$z( zLN1|@N3Q z5If$p$ZMP$@CSJI^>l-x-! 
z|AvA`g0S5wg)d@j043q|R^p(3A?WMXMyb{MqUDP70d9sOl)KepOw>}yfm!hQZ)xos zMG4hey5G6h07mRa&0|-cux=ygewPnBdrf7wVht+LvCyhxV;44{1D}$IuRqnRLz7aL z)H|1QQm+a3-u}_p#q)XV^tT=A=eH4a!98em##P3l*L#}5W@929WL}wqd+lL(UH8 z`3;fw`mIY`uuwPXRLP2CpWK}(ES05MLkucs^;D#@)=#=!!eT_pa&3}g0Gavez{xoiV zLX?>pC~3q{YWW@&Du&*ND*fi832TB@D@;-C7Wo+o>D?eeyaV={sEo zLdkLlT7cuOTEdz9`~l{5D$93z#nZMDTH~d9S$<-pz=u~I6&ZTL>Ra!w8?b%D7inP z0)C?hpGGc!A`TXLKbc4DF<%LiZK>svoE~WH1>ES*J|srTW#SpRKedmws*Z zJjD}emZZeD)&$=q+=bNe8I=~@c+{>Zq=ttbxEN$q=RjB!QVT52B16!Kj~3lvY4ni~ z(dC6}%Nz*n4$Wu$^1h;c2HdPay!7sv9;tQv@-DQP##c7R;9emL(?_OVX=@N7kgy$G z#T)o*)%{p#e`lhz%iY+H-M`X4s?l{D3YgV_9H4FPq`DapxJ#!JXAp7rr-*cb+E}cK zAZ=S&Z64F?wE3RjYeH?n8@=7Hyj?f2&ZdP{iTw9)Q}Jh_I-qgwY_uoUlwAe9r3`RB zAt4l|dp_ab1-KJDBYWREyl0gI+Jm$d29xgRfd7(U{^Wk7mwiv@AIpoXfcY;~!PwB+ zSlryu!uC&}^ErM)G2n#^&nY+P3ePQL3I!YiaPPdJy?ujcM4^L}jV@1YrgZ2V<}!R- z_EOXJPH%rsyB_N()xVbSBXji}s|AWetgLJapC8-+x1Eu(AzgJIg0u^Ln_HTW4*t2~ z64M;*o>c!TLc3jeFP!e#$kT)sv>fhJVf~&TB@2&yD}k@+%F62}Jx=)@=FgO}U7-bp z=d6d^9M(%mZ(?i4Xf`Mf5CmLkv3#;GI3ty~e6p&P96Jd~^1;;eW-^uIF@Qk^)D~Em z|5AAhKQXJ$zFgaK*#xfbN-;CUThvYv!Q+Nx{YsRPj2BwS9@dKu>pW7T)3pjZRFVcS zK~v4>{kJc7!5a95l+dD|?^jJ@q<9I)BWmz@Jw=yZ)hfI^LKPg_c2efOG?jN8zWqGR zK&Lf>f*n`M-{OO`g!M4lp(X|QUxO+d7CSMh^zV9Dl31!v zqV-5|BMq5z;xFGp*U;0nk#*7@fSJFUzNXfZW-j4>krDs*)`Eg&0sV)x3t^`hfcr~U zggDUSLX|^8=t+tH($!v~ss#%I5xqq9pEV8Xq-P=iPc7*mJ1qo}fs**|x&F(`IR6-5 zc6AU*23nH8$0;wOCc(ei-LWC#gt(AY23(TA|1AFtRE_%=X!8a1cWeD$pgV#W5D|oh zkrA4o1fs>rNRa( Date: Fri, 11 Nov 2016 14:59:26 +0100 Subject: [PATCH 053/183] TST: correct url for test file on s3 (xref #14587) --- pandas/io/tests/parser/test_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index 964c927c3c496..9b02096dd0f26 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -30,7 +30,7 @@ def test_url_gz(self): @tm.network def test_url_gz_infer(self): - url = 'https://s3.amazonaws.com/pandas-test/salaries.csv.gz' + url = 'https://s3.amazonaws.com/pandas-test/salary.table.gz' url_table = read_table(url, compression="infer", engine="python") tm.assert_frame_equal(url_table, self.local_table) From 46000daf257c8e574ca943a57023deac74460edd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 11 Nov 2016 20:10:19 -0500 Subject: [PATCH 054/183] DOC: setup for 0.19.2 --- doc/source/whatsnew.rst | 2 ++ doc/source/whatsnew/v0.19.2.txt | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 doc/source/whatsnew/v0.19.2.txt diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 2a1f2cc47d48e..616e1f5c8efc7 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.19.2.txt + .. include:: whatsnew/v0.19.1.txt .. include:: whatsnew/v0.19.0.txt diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt new file mode 100644 index 0000000000000..8a629198e6998 --- /dev/null +++ b/doc/source/whatsnew/v0.19.2.txt @@ -0,0 +1,24 @@ +.. _whatsnew_0192: + +v0.19.2 (December ??, 2016) +--------------------------- + +This is a minor bug-fix release from 0.19.1 and includes some small regression fixes, +bug fixes and performance improvements. +We recommend that all users upgrade to this version. + +.. contents:: What's new in v0.19.2 + :local: + :backlinks: none + + +.. 
_whatsnew_0192.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + +.. _whatsnew_0192.bug_fixes: + +Bug Fixes +~~~~~~~~~ From f8bd08e9c2fc6365980f41b846bbae4b40f08b83 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 12 Nov 2016 10:58:54 -0500 Subject: [PATCH 055/183] BUG: segfault manifesting with dateutil=2.6 w.r.t. replace when timezones are present closes #14621 Author: Jeff Reback Closes #14631 from jreback/replace and squashes the following commits: 3f95042 [Jeff Reback] BUG: segfault manifesting with dateutil=2.6 w.r.t. replace when timezones are present --- ci/requirements-3.5_OSX.pip | 2 +- doc/source/whatsnew/v0.19.2.txt | 3 + pandas/tseries/offsets.py | 1 + pandas/tseries/tests/test_offsets.py | 20 ++++-- pandas/tseries/tests/test_timezones.py | 89 +++++++++++++++++++++++- pandas/tseries/tests/test_tslib.py | 5 +- pandas/tslib.pyx | 95 ++++++++++++++++++++++---- 7 files changed, 188 insertions(+), 27 deletions(-) diff --git a/ci/requirements-3.5_OSX.pip b/ci/requirements-3.5_OSX.pip index 8a7f51f1bea9c..d1fc1fe24a079 100644 --- a/ci/requirements-3.5_OSX.pip +++ b/ci/requirements-3.5_OSX.pip @@ -1 +1 @@ -python-dateutil>=2.5.0 +python-dateutil==2.5.3 diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 8a629198e6998..dc11dd17bfdd7 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -22,3 +22,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + +- compat with ``dateutil==2.6.0`` for testing (:issue:`14621`) +- allow ``nanoseconds`` in ``Timestamp.replace`` kwargs (:issue:`14621`) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 051cc8aa4d018..2e3852a7edddd 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -68,6 +68,7 @@ def wrapper(self, other): other = other.tz_localize(None) result = func(self, other) + if self._adjust_dst: result = tslib._localize_pydatetime(result, tz) diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 1735ac4e2efa5..768e9212e6c42 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -1,4 +1,5 @@ import os +from distutils.version import LooseVersion from datetime import date, datetime, timedelta from dateutil.relativedelta import relativedelta from pandas.compat import range, iteritems @@ -4851,6 +4852,7 @@ def _test_all_offsets(self, n, **kwds): def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): offset = DateOffset(**{offset_name: offset_n}) + t = tstart + offset if expected_utc_offset is not None: self.assertTrue(get_utc_offset_hours(t) == expected_utc_offset) @@ -4890,17 +4892,23 @@ def _make_timestamp(self, string, hrs_offset, tz): return Timestamp(string + offset_string).tz_convert(tz) def test_fallback_plural(self): - """test moving from daylight savings to standard time""" + # test moving from daylight savings to standard time + import dateutil for tz, utc_offsets in self.timezone_utc_offsets.items(): hrs_pre = utc_offsets['utc_offset_daylight'] hrs_post = utc_offsets['utc_offset_standard'] - self._test_all_offsets( - n=3, tstart=self._make_timestamp(self.ts_pre_fallback, - hrs_pre, tz), - expected_utc_offset=hrs_post) + + if dateutil.__version__ != LooseVersion('2.6.0'): + # buggy ambiguous behavior in 2.6.0 + # GH 14621 + # https://github.com/dateutil/dateutil/issues/321 + self._test_all_offsets( + n=3, tstart=self._make_timestamp(self.ts_pre_fallback, + hrs_pre, tz), + 
expected_utc_offset=hrs_post) def test_springforward_plural(self): - """test moving from standard to daylight savings""" + # test moving from standard to daylight savings for tz, utc_offsets in self.timezone_utc_offsets.items(): hrs_pre = utc_offsets['utc_offset_standard'] hrs_post = utc_offsets['utc_offset_daylight'] diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 00e8ee631f463..db8cda5c76479 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -4,7 +4,7 @@ import numpy as np import pytz - +from distutils.version import LooseVersion from pandas.types.dtypes import DatetimeTZDtype from pandas import (Index, Series, DataFrame, isnull, Timestamp) @@ -518,8 +518,12 @@ def f(): times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous='infer') - self.assertEqual(times[0], Timestamp('2013-10-26 23:00', tz=tz)) - self.assertEqual(times[-1], Timestamp('2013-10-27 01:00', tz=tz)) + self.assertEqual(times[0], Timestamp('2013-10-26 23:00', tz=tz, + freq="H")) + if dateutil.__version__ != LooseVersion('2.6.0'): + # GH 14621 + self.assertEqual(times[-1], Timestamp('2013-10-27 01:00', tz=tz, + freq="H")) def test_ambiguous_nat(self): tz = self.tz('US/Eastern') @@ -1163,6 +1167,85 @@ class TestTimeZones(tm.TestCase): def setUp(self): tm._skip_if_no_pytz() + def test_replace(self): + # GH 14621 + # GH 7825 + # replacing datetime components with and w/o presence of a timezone + dt = Timestamp('2016-01-01 09:00:00') + result = dt.replace(hour=0) + expected = Timestamp('2016-01-01 00:00:00') + self.assertEqual(result, expected) + + for tz in self.timezones: + dt = Timestamp('2016-01-01 09:00:00', tz=tz) + result = dt.replace(hour=0) + expected = Timestamp('2016-01-01 00:00:00', tz=tz) + self.assertEqual(result, expected) + + # we preserve nanoseconds + dt = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + result = dt.replace(hour=0) + expected = Timestamp('2016-01-01 00:00:00.000000123', tz=tz) + self.assertEqual(result, expected) + + # test all + dt = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + result = dt.replace(year=2015, month=2, day=2, hour=0, minute=5, + second=5, microsecond=5, nanosecond=5) + expected = Timestamp('2015-02-02 00:05:05.000005005', tz=tz) + self.assertEqual(result, expected) + + # error + def f(): + dt.replace(foo=5) + self.assertRaises(ValueError, f) + + def f(): + dt.replace(hour=0.1) + self.assertRaises(ValueError, f) + + # assert conversion to naive is the same as replacing tzinfo with None + dt = Timestamp('2013-11-03 01:59:59.999999-0400', tz='US/Eastern') + self.assertEqual(dt.tz_localize(None), dt.replace(tzinfo=None)) + + def test_ambiguous_compat(self): + # validate that pytz and dateutil are compat for dst + # when the transition happens + tm._skip_if_no_dateutil() + tm._skip_if_no_pytz() + + pytz_zone = 'Europe/London' + dateutil_zone = 'dateutil/Europe/London' + result_pytz = (Timestamp('2013-10-27 01:00:00') + .tz_localize(pytz_zone, ambiguous=0)) + result_dateutil = (Timestamp('2013-10-27 01:00:00') + .tz_localize(dateutil_zone, ambiguous=0)) + self.assertEqual(result_pytz.value, result_dateutil.value) + self.assertEqual(result_pytz.value, 1382835600000000000) + + # dateutil 2.6 buggy w.r.t. 
ambiguous=0 + if dateutil.__version__ != LooseVersion('2.6.0'): + # GH 14621 + # https://github.com/dateutil/dateutil/issues/321 + self.assertEqual(result_pytz.to_pydatetime().tzname(), + result_dateutil.to_pydatetime().tzname()) + self.assertEqual(str(result_pytz), str(result_dateutil)) + + # 1 hour difference + result_pytz = (Timestamp('2013-10-27 01:00:00') + .tz_localize(pytz_zone, ambiguous=1)) + result_dateutil = (Timestamp('2013-10-27 01:00:00') + .tz_localize(dateutil_zone, ambiguous=1)) + self.assertEqual(result_pytz.value, result_dateutil.value) + self.assertEqual(result_pytz.value, 1382832000000000000) + + # dateutil < 2.6 is buggy w.r.t. ambiguous timezones + if dateutil.__version__ > LooseVersion('2.5.3'): + # GH 14621 + self.assertEqual(str(result_pytz), str(result_dateutil)) + self.assertEqual(result_pytz.to_pydatetime().tzname(), + result_dateutil.to_pydatetime().tzname()) + def test_index_equals_with_tz(self): left = date_range('1/1/2011', periods=100, freq='H', tz='utc') right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 21cfe84f153fa..b45f867be65dd 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -327,8 +327,9 @@ def test_repr(self): # dateutil zone change (only matters for repr) import dateutil - if dateutil.__version__ >= LooseVersion( - '2.3') and dateutil.__version__ <= LooseVersion('2.4.0'): + if (dateutil.__version__ >= LooseVersion('2.3') and + (dateutil.__version__ <= LooseVersion('2.4.0') or + dateutil.__version__ >= LooseVersion('2.6.0'))): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] else: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index d4eaaa0b5cd16..685de214cef7d 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -98,6 +98,7 @@ except NameError: # py3 cdef inline object create_timestamp_from_ts( int64_t value, pandas_datetimestruct dts, object tz, object freq): + """ convenience routine to construct a Timestamp from its parts """ cdef _Timestamp ts_base ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, dts.day, dts.hour, dts.min, @@ -112,6 +113,7 @@ cdef inline object create_timestamp_from_ts( cdef inline object create_datetime_from_ts( int64_t value, pandas_datetimestruct dts, object tz, object freq): + """ convenience routine to construct a datetime.datetime from its parts """ return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) @@ -378,7 +380,6 @@ class Timestamp(_Timestamp): # Mixing pydatetime positional and keyword arguments is forbidden! 
cdef _TSObject ts - cdef _Timestamp ts_base if offset is not None: # deprecate offset kwd in 0.19.0, GH13593 @@ -412,17 +413,7 @@ class Timestamp(_Timestamp): from pandas.tseries.frequencies import to_offset freq = to_offset(freq) - # make datetime happy - ts_base = _Timestamp.__new__(cls, ts.dts.year, ts.dts.month, - ts.dts.day, ts.dts.hour, ts.dts.min, - ts.dts.sec, ts.dts.us, ts.tzinfo) - - # fill out rest of data - ts_base.value = ts.value - ts_base.freq = freq - ts_base.nanosecond = ts.dts.ps / 1000 - - return ts_base + return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) def _round(self, freq, rounder): @@ -660,8 +651,80 @@ class Timestamp(_Timestamp): astimezone = tz_convert def replace(self, **kwds): - return Timestamp(datetime.replace(self, **kwds), - freq=self.freq) + """ + implements datetime.replace, handles nanoseconds + + Parameters + ---------- + kwargs: key-value dict + + accepted keywords are: + year, month, day, hour, minute, second, microsecond, nanosecond, tzinfo + + values must be integer, or for tzinfo, a tz-convertible + + Returns + ------- + Timestamp with fields replaced + """ + + cdef: + pandas_datetimestruct dts + int64_t value + object tzinfo, result, k, v + _TSObject ts + + # set to naive if needed + tzinfo = self.tzinfo + value = self.value + if tzinfo is not None: + value = tz_convert_single(value, 'UTC', tzinfo) + + # setup components + pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + dts.ps = self.nanosecond * 1000 + + # replace + def validate(k, v): + """ validate integers """ + if not isinstance(v, int): + raise ValueError("value must be an integer, received {v} for {k}".format(v=type(v), k=k)) + return v + + for k, v in kwds.items(): + if k == 'year': + dts.year = validate(k, v) + elif k == 'month': + dts.month = validate(k, v) + elif k == 'day': + dts.day = validate(k, v) + elif k == 'hour': + dts.hour = validate(k, v) + elif k == 'minute': + dts.min = validate(k, v) + elif k == 'second': + dts.sec = validate(k, v) + elif k == 'microsecond': + dts.us = validate(k, v) + elif k == 'nanosecond': + dts.ps = validate(k, v) * 1000 + elif k == 'tzinfo': + tzinfo = v + else: + raise ValueError("invalid name {} passed".format(k)) + + # reconstruct & check bounds + value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + if value != NPY_NAT: + _check_dts_bounds(&dts) + + # set tz if needed + if tzinfo is not None: + value = tz_convert_single(value, tzinfo, 'UTC') + + result = create_timestamp_from_ts(value, dts, tzinfo, self.freq) + + return result def isoformat(self, sep='T'): base = super(_Timestamp, self).isoformat(sep=sep) @@ -5041,7 +5104,9 @@ cpdef normalize_date(object dt): ------- normalized : datetime.datetime or Timestamp """ - if PyDateTime_Check(dt): + if is_timestamp(dt): + return dt.replace(hour=0, minute=0, second=0, microsecond=0, nanosecond=0) + elif PyDateTime_Check(dt): return dt.replace(hour=0, minute=0, second=0, microsecond=0) elif PyDate_Check(dt): return datetime(dt.year, dt.month, dt.day) From 3552dc0c4533a5eafafe859f5afd29a7ce063e03 Mon Sep 17 00:00:00 2001 From: Mykola Golubyev Date: Sat, 12 Nov 2016 11:07:00 -0500 Subject: [PATCH 056/183] TST: Fix trailing current date zeros flaky test_format problem closes #14626 Author: Mykola Golubyev Closes #14638 from MykolaGolubyev/issue-14626 and squashes the following commits: 166ec23 [Mykola Golubyev] Inline datetime_now_without_trailing_zeros within test_format 68a72a7 [Mykola Golubyev] Fix trailing current date zeros flaky test_format problem --- 
pandas/tests/indexes/test_base.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b839ed6331457..ad7e3890b5f32 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -913,7 +913,19 @@ def test_summary(self): def test_format(self): self._check_method_works(Index.format) - index = Index([datetime.now()]) + # GH 14626 + # our formatting is different by definition when we have + # ms vs us precision (e.g. trailing zeros); + # so don't compare this case + def datetime_now_without_trailing_zeros(): + now = datetime.now() + + while str(now).endswith("000"): + now = datetime.now() + + return now + + index = Index([datetime_now_without_trailing_zeros()]) # windows has different precision on datetime.datetime.now (it doesn't # include us since the default for Timestamp shows these but Index From 1d6dbb41b26a39121ec8c4f19f5da78bb0ab4af7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 12 Nov 2016 12:44:06 -0500 Subject: [PATCH 057/183] TST: skip test_gbq.test_upload_data_if_table_exists_replace for now --- pandas/io/tests/test_gbq.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index cca1580b84195..f6ff35a6db0d1 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -824,6 +824,9 @@ def test_upload_data_if_table_exists_append(self): private_key=_get_private_key_path()) def test_upload_data_if_table_exists_replace(self): + + raise nose.SkipTest("buggy test") + destination_table = DESTINATION_TABLE + "4" test_size = 10 From 1606153c0eb753fd1ac3c013dd29e23435e6803f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Nov 2016 14:29:25 -0600 Subject: [PATCH 058/183] COMPAT: Cast to string before raise in read_stata (#14657) Fix ValueError message in StataReader Cast to string before joining the list of typecodes in the error message. --- pandas/io/stata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 985ea9c051505..14bd670862b41 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1210,18 +1210,18 @@ def _read_old_header(self, first_char): if tp in self.OLD_TYPE_MAPPING: typlist.append(self.OLD_TYPE_MAPPING[tp]) else: - typlist.append(tp - 127) # string + typlist.append(tp - 127) # py2 string, py3 bytes try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except: raise ValueError("cannot convert stata types [{0}]" - .format(','.join(typlist))) + .format(','.join(str(x) for x in typlist))) try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except: raise ValueError("cannot convert stata dtypes [{0}]" - .format(','.join(typlist))) + .format(','.join(str(x) for x in typlist))) if self.format_version > 108: self.varlist = [self._null_terminate(self.path_or_buf.read(33)) From 52241a7d8a8be2cc1ae40cbde054b0983ac1f429 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 14 Nov 2016 15:49:01 -0500 Subject: [PATCH 059/183] BLD: fix linting check for .pyx closes #14659 --- ci/lint.sh | 12 +++++++++--- pandas/tslib.pyx | 6 ++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 115a2cdaf7899..d6390a16b763e 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -10,18 +10,24 @@ if [ "$LINT" ]; then # pandas/rpy is deprecated and will be removed. # pandas/src is C code, so no need to search there. 
echo "Linting *.py" - flake8 pandas --filename '*.py' --exclude pandas/rpy,pandas/src + flake8 pandas --filename=*.py --exclude pandas/rpy,pandas/src + if [ $? -ne "0" ]; then + RET=1 + fi echo "Linting *.py DONE" echo "Linting *.pyx" - flake8 pandas --filename '*.pyx' --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126 + flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126 + if [ $? -ne "0" ]; then + RET=1 + fi echo "Linting *.pyx DONE" echo "Linting *.pxi.in" for path in 'src' do echo "linting -> pandas/$path" - flake8 pandas/$path --filename '*.pxi.in' --select=E501,E302,E203,E111,E114,E221,E303,E231,E126 + flake8 pandas/$path --filename=*.pxi.in --select=E501,E302,E203,E111,E114,E221,E303,E231,E126 if [ $? -ne "0" ]; then RET=1 fi diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 685de214cef7d..91d3f0ef70cfe 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -688,7 +688,8 @@ class Timestamp(_Timestamp): def validate(k, v): """ validate integers """ if not isinstance(v, int): - raise ValueError("value must be an integer, received {v} for {k}".format(v=type(v), k=k)) + raise ValueError("value must be an integer, received " + "{v} for {k}".format(v=type(v), k=k)) return v for k, v in kwds.items(): @@ -5105,7 +5106,8 @@ cpdef normalize_date(object dt): normalized : datetime.datetime or Timestamp """ if is_timestamp(dt): - return dt.replace(hour=0, minute=0, second=0, microsecond=0, nanosecond=0) + return dt.replace(hour=0, minute=0, second=0, microsecond=0, + nanosecond=0) elif PyDateTime_Check(dt): return dt.replace(hour=0, minute=0, second=0, microsecond=0) elif PyDate_Check(dt): From 3f523f3652602c7105194b19b72ff30bbff47258 Mon Sep 17 00:00:00 2001 From: sakkemo Date: Tue, 15 Nov 2016 10:28:25 +0200 Subject: [PATCH 060/183] DOC: Update Series.mode docstring (#14653) The sort keyword has never existed, and output is always sorted --- pandas/core/series.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 188204d83d985..2310e75f3d3fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1216,16 +1216,10 @@ def count(self, level=None): dtype='int64').__finalize__(self) def mode(self): - """Returns the mode(s) of the dataset. + """Return the mode(s) of the dataset. - Empty if nothing occurs at least 2 times. Always returns Series even - if only one value. - - Parameters - ---------- - sort : bool, default True - If True, will lexicographically sort values, if False skips - sorting. Result ordering when ``sort=False`` is not defined. + Empty if nothing occurs at least 2 times. Always returns Series even + if only one value is returned. Returns ------- From a7604fcd8e105221c5cd5d469be9a3a308325631 Mon Sep 17 00:00:00 2001 From: Bill Chambers Date: Tue, 15 Nov 2016 18:09:19 -0800 Subject: [PATCH 061/183] fix ##14664 (#14665) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8e18b65e80385..fbc6333dd6fdd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1066,7 +1066,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. 
- lines : boolean, defalut False + lines : boolean, default False If 'orient' is 'records' write out line delimited json format. Will throw ValueError if incorrect 'orient' since others are not list like. From 4814823903b862c411caf527271e384df0d0d7e7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 16 Nov 2016 01:06:06 -0800 Subject: [PATCH 062/183] DOC: Explain pivot vs. pivot_table (#6950) (#14650) Move pivot paragraph to pivot table section add pivot_table reference to pivot --- doc/source/reshaping.rst | 4 ++++ pandas/core/reshape.py | 5 +++++ pandas/tools/pivot.py | 5 +++++ 3 files changed, 14 insertions(+) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 9ed2c42610b69..3a2c48834991f 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -323,6 +323,10 @@ Pivot tables .. _reshaping.pivot: +While ``pivot`` provides general purpose pivoting of DataFrames with various +data types (strings, numerics, etc.), Pandas also provides the ``pivot_table`` +function for pivoting with aggregation of numeric data. + The function ``pandas.pivot_table`` can be used to create spreadsheet-style pivot tables. See the :ref:`cookbook` for some advanced strategies diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index fa5d16bd85e98..055a0041b181a 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -357,6 +357,11 @@ def pivot_simple(index, columns, values): Returns ------- DataFrame + + See also + -------- + DataFrame.pivot_table : generalization of pivot that can handle + duplicate values for one index/column pair """ if (len(index) != len(columns)) or (len(columns) != len(values)): raise AssertionError('Length of index, columns, and values must be the' diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 9e064a1d1fc99..820a545363ee3 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -75,6 +75,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', Returns ------- table : DataFrame + + See also + -------- + DataFrame.pivot : pivot without aggregation that can handle + non-numeric data """ index = _convert_by(index) columns = _convert_by(columns) From 726efc7cd55744eb88636dd6f11293f18355a10a Mon Sep 17 00:00:00 2001 From: Joe Jevnik Date: Wed, 16 Nov 2016 15:57:42 -0500 Subject: [PATCH 063/183] BUG: don't allow users to move from an interned string (#14494) --- pandas/tests/test_util.py | 43 +++++++++++++++++++++++++++++++++++++++ pandas/util/move.c | 8 ++++++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index 9193880df7feb..f5828dab21e37 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -2,6 +2,9 @@ import nose from collections import OrderedDict +import sys +import unittest +from uuid import uuid4 from pandas.util._move import move_into_mutable_buffer, BadMove from pandas.util.decorators import deprecate_kwarg from pandas.util.validators import (validate_args, validate_kwargs, @@ -325,6 +328,46 @@ def test_exactly_one_ref(self): # materialize as bytearray to show that it is mutable self.assertEqual(bytearray(as_stolen_buf), b'test') + @unittest.skipIf( + sys.version_info[0] > 2, + 'bytes objects cannot be interned in py3', + ) + def test_interned(self): + salt = uuid4().hex + + def make_string(): + # We need to actually create a new string so that it has refcount + # one. We use a uuid so that we know the string could not already + # be in the intern table. 
+ return ''.join(('testing: ', salt)) + + # This should work, the string has one reference on the stack. + move_into_mutable_buffer(make_string()) + + refcount = [None] # nonlocal + + def ref_capture(ob): + # Subtract two because those are the references owned by this + # frame: + # 1. The local variables of this stack frame. + # 2. The python data stack of this stack frame. + refcount[0] = sys.getrefcount(ob) - 2 + return ob + + with tm.assertRaises(BadMove): + # If we intern the string it will still have one reference but now + # it is in the intern table so if other people intern the same + # string while the mutable buffer holds the first string they will + # be the same instance. + move_into_mutable_buffer(ref_capture(intern(make_string()))) # noqa + + self.assertEqual( + refcount[0], + 1, + msg='The BadMove was probably raised for refcount reasons instead' + ' of interning reasons', + ) + def test_numpy_errstate_is_default(): # The defaults since numpy 1.6.0 diff --git a/pandas/util/move.c b/pandas/util/move.c index 68fcad793e16c..fb918c302b100 100644 --- a/pandas/util/move.c +++ b/pandas/util/move.c @@ -7,6 +7,9 @@ #define PyString_CheckExact PyBytes_CheckExact #define PyString_AS_STRING PyBytes_AS_STRING #define PyString_GET_SIZE PyBytes_GET_SIZE + +/* in python 3, we cannot intern bytes objects so this is always false */ +#define PyString_CHECK_INTERNED(cs) 0 #endif /* !COMPILING_IN_PY2 */ #ifndef Py_TPFLAGS_HAVE_GETCHARBUFFER @@ -113,8 +116,9 @@ stolenbuf_new(PyObject *self, PyObject *args, PyObject *kwargs) return NULL; } - if (Py_REFCNT(bytes_rvalue) != 1) { - /* there is a reference other than the caller's stack */ + if (Py_REFCNT(bytes_rvalue) != 1 || PyString_CHECK_INTERNED(bytes_rvalue)) { + /* there is a reference other than the caller's stack or the string is + interned */ PyErr_SetObject(badmove, bytes_rvalue); return NULL; } From 2fc0c68ace1cb447f1fa6f016295575a2024db3d Mon Sep 17 00:00:00 2001 From: Luca Scarabello Date: Thu, 17 Nov 2016 07:44:29 -0500 Subject: [PATCH 064/183] BUG: pandas.cut and negative values #14652 closes #14652 Author: Luca Scarabello Author: Luca Closes #14663 from luca-s/issue_14652 and squashes the following commits: 8db26db [Luca Scarabello] Moved new test location to pandas\tools\tests\test_tile.py 90dd07d [Luca Scarabello] Updated whatsnew d6b3da8 [Luca Scarabello] fixed flake8 compliance fdc55b9 [Luca Scarabello] Added test case for #14652 d5790e2 [Luca Scarabello] updated whatsnew v0.19.2 2db0c7a [Luca] BUG: pandas.cut and negative values #14652 --- doc/source/whatsnew/v0.19.2.txt | 5 +++-- pandas/tools/tests/test_tile.py | 12 ++++++++++++ pandas/tools/tile.py | 4 ++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index dc11dd17bfdd7..8feb5355bb295 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -23,5 +23,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- compat with ``dateutil==2.6.0`` for testing (:issue:`14621`) -- allow ``nanoseconds`` in ``Timestamp.replace`` kwargs (:issue:`14621`) +- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) +- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) +- Bug in ``pd.cut`` with negative values and a single bin (:issue:`14652`) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 16731620a1dcd..e5b9c65b515d6 100644 --- a/pandas/tools/tests/test_tile.py +++ 
b/pandas/tools/tests/test_tile.py @@ -271,6 +271,18 @@ def test_series_retbins(self): np.array([0, 0, 1, 1], dtype=np.int8)) tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + def test_single_bin(self): + # issue 14652 + expected = Series([0, 0]) + + s = Series([9., 9.]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + s = Series([-9., -9.]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 62bbfc2f630a5..ef75f2f84779b 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -98,8 +98,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, mn, mx = [mi + 0.0 for mi in rng] if mn == mx: # adjust end points before binning - mn -= .001 * mn - mx += .001 * mx + mn -= .001 * abs(mn) + mx += .001 * abs(mx) bins = np.linspace(mn, mx, bins + 1, endpoint=True) else: # adjust end points after binning bins = np.linspace(mn, mx, bins + 1, endpoint=True) From fe555db3f178b57f1d15c6c30f7bea0ca452db68 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 10 Nov 2016 22:45:56 +0000 Subject: [PATCH 065/183] ENH: Explicit range checking of floats when writing Stata Add explicit error checking for out-of-range doubles when writing Stata files Upcasts float32 to float64 if out-of-range values encountered Tests for infinite values and raises if found closes #14618 closes #14637 --- doc/source/whatsnew/v0.19.2.txt | 1 + doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/stata.py | 17 ++++++++++++ pandas/io/tests/test_stata.py | 49 +++++++++++++++++++++++++++++++-- 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 8feb5355bb295..1c042ec571f87 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -26,3 +26,4 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) - Bug in ``pd.cut`` with negative values and a single bin (:issue:`14652`) +- Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 660300e1814e8..8819a95f27b0d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -80,3 +80,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 14bd670862b41..c35e07be2c31a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -511,6 +511,9 @@ def _cast_to_stata_types(data): (np.uint16, np.int16, np.int32), (np.uint32, np.int32, np.int64)) + float32_max = struct.unpack('= 2 ** 53 or data[col].min() <= -2 ** 53: ws = precision_loss_doc % ('int64', 'float64') + elif dtype in (np.float32, np.float64): + value = data[col].max() + if np.isinf(value): + msg = 'Column {0} has a maximum value of infinity which is ' \ + 'outside the range supported by Stata.' 
+ raise ValueError(msg.format(col)) + if dtype == np.float32 and value > float32_max: + data[col] = data[col].astype(np.float64) + elif dtype == np.float64: + if value > float64_max: + msg = 'Column {0} has a maximum value ({1}) outside the ' \ + 'range supported by Stata ({1})' + raise ValueError(msg.format(col, value, float64_max)) if ws: import warnings @@ -2048,6 +2064,7 @@ def _prepare_pandas(self, data): data = self._check_column_names(data) # Check columns for compatibility with stata, upcast if necessary + # Raise if outside the supported range data = _cast_to_stata_types(data) # Replace NaNs with Stata missing values diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1849b32a4a7c8..cd972868a6e32 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -11,8 +11,6 @@ import nose import numpy as np -from pandas.tslib import NaT - import pandas as pd import pandas.util.testing as tm from pandas import compat @@ -21,6 +19,7 @@ from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) +from pandas.tslib import NaT from pandas.types.common import is_categorical_dtype @@ -1234,6 +1233,52 @@ def test_stata_111(self): original = original[['y', 'x', 'w', 'z']] tm.assert_frame_equal(original, df) + def test_out_of_range_double(self): + # GH 14618 + df = DataFrame({'ColumnOk': [0.0, + np.finfo(np.double).eps, + 4.49423283715579e+307], + 'ColumnTooBig': [0.0, + np.finfo(np.double).eps, + np.finfo(np.double).max]}) + with tm.assertRaises(ValueError) as cm: + with tm.ensure_clean() as path: + df.to_stata(path) + tm.assertTrue('ColumnTooBig' in cm.exception) + + df.loc[2, 'ColumnTooBig'] = np.inf + with tm.assertRaises(ValueError) as cm: + with tm.ensure_clean() as path: + df.to_stata(path) + tm.assertTrue('ColumnTooBig' in cm.exception) + tm.assertTrue('infinity' in cm.exception) + + def test_out_of_range_float(self): + original = DataFrame({'ColumnOk': [0.0, + np.finfo(np.float32).eps, + np.finfo(np.float32).max / 10.0], + 'ColumnTooBig': [0.0, + np.finfo(np.float32).eps, + np.finfo(np.float32).max]}) + original.index.name = 'index' + for col in original: + original[col] = original[col].astype(np.float32) + + with tm.ensure_clean() as path: + original.to_stata(path) + reread = read_stata(path) + original['ColumnTooBig'] = original['ColumnTooBig'].astype( + np.float64) + tm.assert_frame_equal(original, + reread.set_index('index')) + + original.loc[2, 'ColumnTooBig'] = np.inf + with tm.assertRaises(ValueError) as cm: + with tm.ensure_clean() as path: + original.to_stata(path) + tm.assertTrue('ColumnTooBig' in cm.exception) + tm.assertTrue('infinity' in cm.exception) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 7e6693797674611353c8e3ae6fe405114bb8140c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 07:55:07 -0500 Subject: [PATCH 066/183] DOC: add whitespace to bug fix section of 0.19.2 --- doc/source/whatsnew/v0.19.2.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 1c042ec571f87..978e7616c1018 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -25,5 +25,20 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) + + + + + + 
+ - Bug in ``pd.cut`` with negative values and a single bin (:issue:`14652`) + + + + + + + + - Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) From b5864b0af00ef9406dca5f4988fd79ff8341bbc6 Mon Sep 17 00:00:00 2001 From: Nicholas Ver Halen Date: Wed, 12 Oct 2016 16:13:38 -0500 Subject: [PATCH 067/183] BUG: to_numeric downcast = 'unsigned' would not un-sign a 0 value closes #14504 closes #14401 --- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/tools/tests/test_util.py | 38 ++++++++++++++++++++++++++++++++- pandas/tools/util.py | 2 +- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 978e7616c1018..f193de7fbdbd0 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -33,6 +33,7 @@ Bug Fixes - Bug in ``pd.cut`` with negative values and a single bin (:issue:`14652`) +- Bug in ``pd.to_numeric`` where a 0 was not unsigned on a ``downcast='unsigned'`` argument (:issue:`14401`) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 8c16308d79a31..f9647721e3c5b 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -4,9 +4,10 @@ import nose import numpy as np +from numpy import iinfo import pandas as pd -from pandas import date_range, Index +from pandas import (date_range, Index, _np_version_under1p9) import pandas.util.testing as tm from pandas.tools.util import cartesian_product, to_numeric @@ -401,6 +402,41 @@ def test_downcast(self): res = pd.to_numeric(data, downcast=downcast) tm.assert_numpy_array_equal(res, expected) + def test_downcast_limits(self): + # Test the limits of each downcast. Bug: #14401. + # Check to make sure numpy is new enough to run this test. + if _np_version_under1p9: + raise nose.SkipTest("Numpy version is under 1.9") + + i = 'integer' + u = 'unsigned' + dtype_downcast_min_max = [ + ('int8', i, [iinfo(np.int8).min, iinfo(np.int8).max]), + ('int16', i, [iinfo(np.int16).min, iinfo(np.int16).max]), + ('int32', i, [iinfo(np.int32).min, iinfo(np.int32).max]), + ('int64', i, [iinfo(np.int64).min, iinfo(np.int64).max]), + ('uint8', u, [iinfo(np.uint8).min, iinfo(np.uint8).max]), + ('uint16', u, [iinfo(np.uint16).min, iinfo(np.uint16).max]), + ('uint32', u, [iinfo(np.uint32).min, iinfo(np.uint32).max]), + # Test will be skipped until there is more uint64 support. + # ('uint64', u, [iinfo(uint64).min, iinfo(uint64).max]), + ('int16', i, [iinfo(np.int8).min, iinfo(np.int8).max + 1]), + ('int32', i, [iinfo(np.int16).min, iinfo(np.int16).max + 1]), + ('int64', i, [iinfo(np.int32).min, iinfo(np.int32).max + 1]), + ('int16', i, [iinfo(np.int8).min - 1, iinfo(np.int16).max]), + ('int32', i, [iinfo(np.int16).min - 1, iinfo(np.int32).max]), + ('int64', i, [iinfo(np.int32).min - 1, iinfo(np.int64).max]), + ('uint16', u, [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]), + ('uint32', u, [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]), + # Test will be skipped until there is more uint64 support. 
+ # ('uint64', u, [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]), + ] + + for dtype, downcast, min_max in dtype_downcast_min_max: + series = pd.to_numeric(pd.Series(min_max), downcast=downcast) + tm.assert_equal(series.dtype, dtype) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index fec56328c1721..b50bf9dc448bc 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -205,7 +205,7 @@ def to_numeric(arg, errors='raise', downcast=None): if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] - elif downcast == 'unsigned' and np.min(values) > 0: + elif downcast == 'unsigned' and np.min(values) >= 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] From b23a329edf362baba84e04f4d7a9f939d76a0edf Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 08:31:01 -0500 Subject: [PATCH 068/183] BLD: add py3.6 build generic support for additional installation instructions (e.g. 3.4_slow build) remove 2.7_NUMPY_DEV build files (not longer on travis) use python 3.5 for root conda env, use PYTHON_VERSION to designate conda installation --- .travis.yml | 54 +++++++++++++++++-- ci/install-2.7_NUMPY_DEV.sh | 18 ------- ci/install-3.6_DEV.sh | 29 ++++++++++ ci/install_travis.sh | 49 +++++++++-------- ci/requirements-2.7_NUMPY_DEV.build | 3 -- ci/requirements-2.7_NUMPY_DEV.run | 2 - ci/requirements-3.4_SLOW.sh | 7 +++ ...Y_DEV.sh => requirements-3.5_NUMPY_DEV.sh} | 0 8 files changed, 112 insertions(+), 50 deletions(-) delete mode 100644 ci/install-2.7_NUMPY_DEV.sh create mode 100644 ci/install-3.6_DEV.sh delete mode 100644 ci/requirements-2.7_NUMPY_DEV.build delete mode 100644 ci/requirements-2.7_NUMPY_DEV.run create mode 100644 ci/requirements-3.4_SLOW.sh rename ci/{install-3.5_NUMPY_DEV.sh => requirements-3.5_NUMPY_DEV.sh} (100%) diff --git a/.travis.yml b/.travis.yml index 4eefd6ca83694..9e54fecc72ecf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,6 +34,7 @@ matrix: compiler: clang osx_image: xcode6.4 env: + - PYTHON_VERSION=3.5 - JOB_NAME: "35_osx" - NOSE_ARGS="not slow and not network and not disabled" - BUILD_TYPE=conda @@ -43,6 +44,7 @@ matrix: - USE_CACHE=true - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_slow_nnet_LOCALE" - NOSE_ARGS="slow and not network and not disabled" - LOCALE_OVERRIDE="zh_CN.UTF-8" @@ -56,6 +58,7 @@ matrix: - language-pack-zh-hans - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_nslow" - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true @@ -69,6 +72,7 @@ matrix: - python-gtk2 - python: 3.4 env: + - PYTHON_VERSION=3.4 - JOB_NAME: "34_nslow" - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true @@ -81,6 +85,7 @@ matrix: - xsel - python: 3.5 env: + - PYTHON_VERSION=3.5 - JOB_NAME: "35_nslow" - NOSE_ARGS="not slow and not network and not disabled" - FULL_DEPS=true @@ -95,6 +100,7 @@ matrix: # In allow_failures - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_slow" - JOB_TAG=_SLOW - NOSE_ARGS="slow and not network and not disabled" @@ -104,6 +110,7 @@ matrix: # In allow_failures - python: 3.4 env: + - PYTHON_VERSION=3.4 - JOB_NAME: "34_slow" - JOB_TAG=_SLOW - NOSE_ARGS="slow and not network and not disabled" @@ -118,6 +125,7 @@ matrix: # In allow_failures - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_build_test_conda" - JOB_TAG=_BUILD_TEST - NOSE_ARGS="not slow and not disabled" @@ -125,9 +133,23 @@ matrix: - 
BUILD_TEST=true - CACHE_NAME="27_build_test_conda" - USE_CACHE=true +# In allow_failures + - python: 3.6-dev + env: + - PYTHON_VERSION=3.6 + - JOB_NAME: "36_dev" + - JOB_TAG=_DEV + - NOSE_ARGS="not slow and not network and not disabled" + - PANDAS_TESTING_MODE="deprecate" + addons: + apt: + packages: + - libatlas-base-dev + - gfortran # In allow_failures - python: 3.5 env: + - PYTHON_VERSION=3.5 - JOB_NAME: "35_numpy_dev" - JOB_TAG=_NUMPY_DEV - NOSE_ARGS="not slow and not network and not disabled" @@ -142,6 +164,7 @@ matrix: # In allow_failures - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_nslow_nnet_COMPAT" - NOSE_ARGS="not slow and not network and not disabled" - LOCALE_OVERRIDE="it_IT.UTF-8" @@ -156,6 +179,7 @@ matrix: # In allow_failures - python: 3.5 env: + - PYTHON_VERSION=3.5 - JOB_NAME: "35_ascii" - JOB_TAG=_ASCII - NOSE_ARGS="not slow and not network and not disabled" @@ -165,6 +189,7 @@ matrix: # In allow_failures - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "doc_build" - FULL_DEPS=true - DOC_BUILD=true @@ -174,6 +199,7 @@ matrix: allow_failures: - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_slow" - JOB_TAG=_SLOW - NOSE_ARGS="slow and not network and not disabled" @@ -182,6 +208,7 @@ matrix: - USE_CACHE=true - python: 3.4 env: + - PYTHON_VERSION=3.4 - JOB_NAME: "34_slow" - JOB_TAG=_SLOW - NOSE_ARGS="slow and not network and not disabled" @@ -195,6 +222,7 @@ matrix: - xsel - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_build_test_conda" - JOB_TAG=_BUILD_TEST - NOSE_ARGS="not slow and not disabled" @@ -202,14 +230,27 @@ matrix: - BUILD_TEST=true - CACHE_NAME="27_build_test_conda" - USE_CACHE=true - - python: 3.5 + - python: 3.6-dev env: - - JOB_NAME: "35_numpy_dev" - - JOB_TAG=_NUMPY_DEV + - PYTHON_VERSION=3.6 + - JOB_NAME: "36_dev" + - JOB_TAG=_DEV - NOSE_ARGS="not slow and not network and not disabled" - PANDAS_TESTING_MODE="deprecate" - - CACHE_NAME="35_numpy_dev" - - USE_CACHE=true + addons: + apt: + packages: + - libatlas-base-dev + - gfortran + - python: 3.5 + env: + - PYTHON_VERSION=3.5 + - JOB_NAME: "35_numpy_dev" + - JOB_TAG=_NUMPY_DEV + - NOSE_ARGS="not slow and not network and not disabled" + - PANDAS_TESTING_MODE="deprecate" + - CACHE_NAME="35_numpy_dev" + - USE_CACHE=true addons: apt: packages: @@ -217,6 +258,7 @@ matrix: - gfortran - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "27_nslow_nnet_COMPAT" - NOSE_ARGS="not slow and not network and not disabled" - LOCALE_OVERRIDE="it_IT.UTF-8" @@ -230,6 +272,7 @@ matrix: - language-pack-it - python: 3.5 env: + - PYTHON_VERSION=3.5 - JOB_NAME: "35_ascii" - JOB_TAG=_ASCII - NOSE_ARGS="not slow and not network and not disabled" @@ -238,6 +281,7 @@ matrix: - USE_CACHE=true - python: 2.7 env: + - PYTHON_VERSION=2.7 - JOB_NAME: "doc_build" - FULL_DEPS=true - DOC_BUILD=true diff --git a/ci/install-2.7_NUMPY_DEV.sh b/ci/install-2.7_NUMPY_DEV.sh deleted file mode 100644 index 22ac8f6547879..0000000000000 --- a/ci/install-2.7_NUMPY_DEV.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "install numpy master wheel" - -# remove the system installed numpy -pip uninstall numpy -y - -# we need these for numpy - -# these wheels don't play nice with the conda libgfortran / openblas -# time conda install -n pandas libgfortran openblas || exit 1 - -# install numpy wheel from master -pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy - -true diff --git a/ci/install-3.6_DEV.sh 
b/ci/install-3.6_DEV.sh new file mode 100644 index 0000000000000..e24533313c713 --- /dev/null +++ b/ci/install-3.6_DEV.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +echo "install 3.6 dev" + +conda config --set add_pip_as_python_dependency false +conda create -n pandas python=3.6 -c conda-forge/label/prerelease + +source activate pandas + +# ensure we have pip +python -m ensurepip +pip3.6 install nose + +# build cython +git clone https://github.com/cython/cython.git +cd cython +git checkout 0.25.1 +python setup.py install +cd .. + +# remove the system installed numpy +pip3.6 uninstall numpy -y + +# install deps +pip3.6 install numpy +pip3.6 install pytz +pip3.6 install python-dateutil + +true diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 98ce36acc096e..39f6808b9d317 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -31,9 +31,6 @@ edit_init home_dir=$(pwd) echo "home_dir: [$home_dir]" -python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" -[ "$python_major_version" == "2" ] && python_major_version="" - MINICONDA_DIR="$HOME/miniconda" if [ -d "$MINICONDA_DIR" ] && [ -e "$MINICONDA_DIR/bin/conda" ] && [ "$USE_CACHE" ]; then @@ -63,9 +60,9 @@ else rm -rf "$MINICONDA_DIR" # install miniconda if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 else - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 fi bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 @@ -84,21 +81,25 @@ else # Useful for debugging any issues with conda conda info -a || exit 1 - - time conda create -n pandas python=$TRAVIS_PYTHON_VERSION nose coverage flake8 || exit 1 - fi -# build deps -REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.build" -# may have additional installation instructions for this build -INSTALL="ci/install-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.sh" +# may have installation instructions for this build +INSTALL="ci/install-${PYTHON_VERSION}${JOB_TAG}.sh" if [ -e ${INSTALL} ]; then time bash $INSTALL || exit 1 +else + + # create new env + time conda create -n pandas python=$PYTHON_VERSION nose coverage flake8 || exit 1 fi +# build deps +REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.build" + # install deps -time conda install -n pandas --file=${REQ} || exit 1 +if [ -e ${REQ} ]; then + time conda install -n pandas --file=${REQ} || exit 1 +fi source activate pandas @@ -106,7 +107,7 @@ if [ "$BUILD_TEST" ]; then # build testing pip uninstall --yes cython - pip install cython==0.15.1 + pip install cython==0.19.1 ( python setup.py build_ext --inplace && python setup.py develop ) || true else @@ -117,14 +118,22 @@ else # we may have run installations echo "conda installs" - REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.run" - time conda install -n pandas --file=${REQ} || exit 1 + REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.run" + if [ -e ${REQ} ]; then + time conda install -n pandas --file=${REQ} || exit 1 + fi # we may have additional pip installs echo "pip installs" - REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.pip" + REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.pip" if [ -e ${REQ} ]; then - pip install --upgrade -r $REQ + pip install --upgrade -r $REQ + fi + + # may have addtl installation instructions 
for this build + REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.sh" + if [ -e ${REQ} ]; then + time bash $REQ || exit 1 fi # remove any installed pandas package @@ -138,9 +147,5 @@ else fi -if [ "$JOB_NAME" == "34_slow" ]; then - conda install -c conda-forge/label/rc -c conda-forge matplotlib -fi - echo "done" exit 0 diff --git a/ci/requirements-2.7_NUMPY_DEV.build b/ci/requirements-2.7_NUMPY_DEV.build deleted file mode 100644 index d15edbfa3d2c1..0000000000000 --- a/ci/requirements-2.7_NUMPY_DEV.build +++ /dev/null @@ -1,3 +0,0 @@ -python-dateutil -pytz -cython diff --git a/ci/requirements-2.7_NUMPY_DEV.run b/ci/requirements-2.7_NUMPY_DEV.run deleted file mode 100644 index 0aa987baefb1d..0000000000000 --- a/ci/requirements-2.7_NUMPY_DEV.run +++ /dev/null @@ -1,2 +0,0 @@ -python-dateutil -pytz diff --git a/ci/requirements-3.4_SLOW.sh b/ci/requirements-3.4_SLOW.sh new file mode 100644 index 0000000000000..bc8fb79147d2c --- /dev/null +++ b/ci/requirements-3.4_SLOW.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "install 34_slow" + +conda install -n pandas -c conda-forge/label/rc -c conda-forge matplotlib diff --git a/ci/install-3.5_NUMPY_DEV.sh b/ci/requirements-3.5_NUMPY_DEV.sh similarity index 100% rename from ci/install-3.5_NUMPY_DEV.sh rename to ci/requirements-3.5_NUMPY_DEV.sh From b52dda8fa4fa3818730f7e3493a6fe1108684918 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 12:40:26 -0500 Subject: [PATCH 069/183] ERR: fix exception propogation for datetime parsing functions, noted in python 3.6 closes #14561 Author: Jeff Reback Closes #14678 from jreback/py3.6_fix and squashes the following commits: c8eed83 [Jeff Reback] ERR: fix exception propogation for datetime parsing functions, noted in python 3.6 --- doc/source/whatsnew/v0.19.2.txt | 2 +- pandas/src/datetime.pxd | 8 ++++---- setup.py | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index f193de7fbdbd0..f4a45a6938a95 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -38,7 +38,7 @@ Bug Fixes - +- Bug in not propogating exceptions in parsing invalid datetimes, noted in python 3.6 (:issue:`14561`) diff --git a/pandas/src/datetime.pxd b/pandas/src/datetime.pxd index 5f7de8244d17e..d3d471a33715d 100644 --- a/pandas/src/datetime.pxd +++ b/pandas/src/datetime.pxd @@ -126,8 +126,8 @@ cdef extern from "datetime/np_datetime_strings.h": -cdef inline _string_to_dts(object val, pandas_datetimestruct* dts, - int* out_local, int* out_tzoffset): +cdef inline int _string_to_dts(object val, pandas_datetimestruct* dts, + int* out_local, int* out_tzoffset) except? -1: cdef int result cdef char *tmp @@ -139,10 +139,11 @@ cdef inline _string_to_dts(object val, pandas_datetimestruct* dts, if result == -1: raise ValueError('Unable to parse %s' % str(val)) + return result cdef inline int _cstring_to_dts(char *val, int length, pandas_datetimestruct* dts, - int* out_local, int* out_tzoffset): + int* out_local, int* out_tzoffset) except? 
-1: cdef: npy_bool special PANDAS_DATETIMEUNIT out_bestunit @@ -195,4 +196,3 @@ cdef inline int64_t _date_to_datetime64(object val, dts.hour = dts.min = dts.sec = dts.us = 0 dts.ps = dts.as = 0 return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) - diff --git a/setup.py b/setup.py index a17dd502d7706..351d2b39ce6aa 100755 --- a/setup.py +++ b/setup.py @@ -454,7 +454,8 @@ def pxd(name): tseries_depends = ['pandas/src/datetime/np_datetime.h', 'pandas/src/datetime/np_datetime_strings.h', - 'pandas/src/period_helper.h'] + 'pandas/src/period_helper.h', + 'pandas/src/datetime.pxd'] # some linux distros require it From 313b8e4a3fbbf747ed4e57b8ade465775b48677e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 12:54:23 -0500 Subject: [PATCH 070/183] BLD: use miniconda3 as the miniconda dir --- ci/install_travis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 39f6808b9d317..bdd2c01f611b2 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -31,7 +31,7 @@ edit_init home_dir=$(pwd) echo "home_dir: [$home_dir]" -MINICONDA_DIR="$HOME/miniconda" +MINICONDA_DIR="$HOME/miniconda3" if [ -d "$MINICONDA_DIR" ] && [ -e "$MINICONDA_DIR/bin/conda" ] && [ "$USE_CACHE" ]; then echo "Miniconda install already present from cache: $MINICONDA_DIR" From 908c224261db35a6ab0e6455383c10a17cd829a8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 13:09:32 -0500 Subject: [PATCH 071/183] BLD: use correct path for travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9e54fecc72ecf..49765c9df96ea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -293,7 +293,7 @@ before_install: - echo "before_install" - source ci/travis_process_gbq_encryption.sh - echo $VIRTUAL_ENV - - export PATH="$HOME/miniconda/bin:$PATH" + - export PATH="$HOME/miniconda3/bin:$PATH" - df -h - date - pwd From 45543ec22bb01d5c46eff9384491d218cffddd64 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 13:03:16 -0500 Subject: [PATCH 072/183] BLD: cleaner 3.6 deps install --- ci/install-3.6_DEV.sh | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/ci/install-3.6_DEV.sh b/ci/install-3.6_DEV.sh index e24533313c713..0b95f1cd45cad 100644 --- a/ci/install-3.6_DEV.sh +++ b/ci/install-3.6_DEV.sh @@ -9,21 +9,8 @@ source activate pandas # ensure we have pip python -m ensurepip -pip3.6 install nose - -# build cython -git clone https://github.com/cython/cython.git -cd cython -git checkout 0.25.1 -python setup.py install -cd .. 
- -# remove the system installed numpy -pip3.6 uninstall numpy -y # install deps -pip3.6 install numpy -pip3.6 install pytz -pip3.6 install python-dateutil +pip3.6 install nose cython numpy pytz python-dateutil true From 748000d220c3c7d4daddf5811e10a5acdfce49ea Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 20:03:43 -0500 Subject: [PATCH 073/183] COMPAT: remove some deprecation warnings in 3.6 partial on #14679 Author: Jeff Reback Closes #14681 from jreback/warnings and squashes the following commits: 56e7b1b [Jeff Reback] COMPAT: remove some deprecation warnings in 3.6 --- pandas/io/tests/json/test_pandas.py | 2 +- pandas/io/tests/parser/c_parser_only.py | 8 ++--- pandas/io/tests/parser/common.py | 16 ++++----- pandas/io/tests/parser/header.py | 2 +- pandas/io/tests/parser/python_parser_only.py | 4 +-- pandas/io/tests/parser/test_unsupported.py | 10 +++--- pandas/io/tests/parser/usecols.py | 2 +- pandas/io/tests/test_clipboard.py | 2 +- pandas/io/tests/test_excel.py | 4 +-- pandas/io/tests/test_html.py | 2 +- pandas/io/tests/test_pytables.py | 2 +- pandas/sparse/tests/test_array.py | 2 +- pandas/tests/formats/test_format.py | 10 +++--- pandas/tests/frame/test_analytics.py | 2 +- pandas/tests/frame/test_constructors.py | 6 ++-- pandas/tests/frame/test_query_eval.py | 4 +-- pandas/tests/frame/test_replace.py | 4 +-- pandas/tests/indexes/common.py | 2 +- pandas/tests/indexes/test_category.py | 2 +- pandas/tests/indexes/test_multi.py | 2 +- pandas/tests/indexing/test_indexing.py | 6 ++-- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/test_base.py | 2 +- pandas/tests/test_generic.py | 6 ++-- pandas/tests/test_internals.py | 2 +- pandas/tests/test_multilevel.py | 4 +-- pandas/tests/test_panel.py | 16 ++++----- pandas/tests/test_strings.py | 17 +++++----- pandas/tests/test_testing.py | 4 +-- pandas/tests/test_util.py | 34 ++++++++++---------- pandas/tseries/tests/test_base.py | 4 +-- pandas/tseries/tests/test_period.py | 14 ++++---- pandas/tseries/tests/test_timeseries.py | 8 +++-- 33 files changed, 105 insertions(+), 102 deletions(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 117ac2324d0e0..ba02e9186f1df 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -971,7 +971,7 @@ def test_to_jsonl(self): def test_latin_encoding(self): if compat.PY2: self.assertRaisesRegexp( - TypeError, '\[unicode\] is not implemented as a table column') + TypeError, r'\[unicode\] is not implemented as a table column') return # GH 13774 diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 09d521e5a7e46..75b99654dbf89 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -71,11 +71,11 @@ def test_dtype_and_names_error(self): 3.0 3 """ # base cases - result = self.read_csv(StringIO(data), sep='\s+', header=None) + result = self.read_csv(StringIO(data), sep=r'\s+', header=None) expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) tm.assert_frame_equal(result, expected) - result = self.read_csv(StringIO(data), sep='\s+', + result = self.read_csv(StringIO(data), sep=r'\s+', header=None, names=['a', 'b']) expected = DataFrame( [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b']) @@ -83,7 +83,7 @@ def test_dtype_and_names_error(self): # fallback casting result = self.read_csv(StringIO( - data), sep='\s+', header=None, + data), sep=r'\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) expected = 
DataFrame([[1, 1], [2, 2], [3, 3]], columns=['a', 'b']) @@ -97,7 +97,7 @@ def test_dtype_and_names_error(self): """ # fallback casting, but not castable with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'): - self.read_csv(StringIO(data), sep='\s+', header=None, + self.read_csv(StringIO(data), sep=r'\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) def test_passing_dtype(self): diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index f0fdc9398084f..397292ec6d036 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -836,7 +836,7 @@ def test_integer_overflow_bug(self): result = self.read_csv(StringIO(data), header=None, sep=' ') self.assertTrue(result[0].dtype == np.float64) - result = self.read_csv(StringIO(data), header=None, sep='\s+') + result = self.read_csv(StringIO(data), header=None, sep=r'\s+') self.assertTrue(result[0].dtype == np.float64) def test_catch_too_many_names(self): @@ -852,7 +852,7 @@ def test_catch_too_many_names(self): def test_ignore_leading_whitespace(self): # see gh-3374, gh-6607 data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' - result = self.read_table(StringIO(data), sep='\s+') + result = self.read_table(StringIO(data), sep=r'\s+') expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) tm.assert_frame_equal(result, expected) @@ -1052,7 +1052,7 @@ def test_uneven_lines_with_usecols(self): # make sure that an error is still thrown # when the 'usecols' parameter is not provided - msg = "Expected \d+ fields in line \d+, saw \d+" + msg = r"Expected \d+ fields in line \d+, saw \d+" with tm.assertRaisesRegexp(ValueError, msg): df = self.read_csv(StringIO(csv)) @@ -1122,7 +1122,7 @@ def test_raise_on_sep_with_delim_whitespace(self): # see gh-6607 data = 'a b c\n1 2 3' with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): - self.read_table(StringIO(data), sep='\s', delim_whitespace=True) + self.read_table(StringIO(data), sep=r'\s', delim_whitespace=True) def test_single_char_leading_whitespace(self): # see gh-9710 @@ -1157,7 +1157,7 @@ def test_empty_lines(self): [-70., .4, 1.]]) df = self.read_csv(StringIO(data)) tm.assert_numpy_array_equal(df.values, expected) - df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') + df = self.read_csv(StringIO(data.replace(',', ' ')), sep=r'\s+') tm.assert_numpy_array_equal(df.values, expected) expected = np.array([[1., 2., 4.], [np.nan, np.nan, np.nan], @@ -1189,14 +1189,14 @@ def test_regex_separator(self): b 1 2 3 4 c 1 2 3 4 """ - df = self.read_table(StringIO(data), sep='\s+') + df = self.read_table(StringIO(data), sep=r'\s+') expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)), index_col=0) self.assertIsNone(expected.index.name) tm.assert_frame_equal(df, expected) data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' - result = self.read_table(StringIO(data), sep='\s+') + result = self.read_table(StringIO(data), sep=r'\s+') expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) @@ -1580,7 +1580,7 @@ def test_temporary_file(self): new_file.flush() new_file.seek(0) - result = self.read_csv(new_file, sep='\s+', header=None) + result = self.read_csv(new_file, sep=r'\s+', header=None) new_file.close() expected = DataFrame([[0, 0]]) tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/header.py b/pandas/io/tests/parser/header.py index 33a4d71fc03b6..dc6d2ad1daa47 100644 --- a/pandas/io/tests/parser/header.py +++ 
b/pandas/io/tests/parser/header.py @@ -15,7 +15,7 @@ class HeaderTests(object): def test_read_with_bad_header(self): - errmsg = "but only \d+ lines in file" + errmsg = r"but only \d+ lines in file" with tm.assertRaisesRegexp(ValueError, errmsg): s = StringIO(',,') diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 3214aa39358e8..bbc1c3bab7635 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -162,7 +162,7 @@ def test_read_table_buglet_4x_multiindex(self): a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - df = self.read_table(StringIO(text), sep='\s+') + df = self.read_table(StringIO(text), sep=r'\s+') self.assertEqual(df.index.names, ('one', 'two', 'three', 'four')) # see gh-6893 @@ -170,7 +170,7 @@ def test_read_table_buglet_4x_multiindex(self): expected = DataFrame.from_records( [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], columns=list('abcABC'), index=list('abc')) - actual = self.read_table(StringIO(data), sep='\s+') + actual = self.read_table(StringIO(data), sep=r'\s+') tm.assert_frame_equal(actual, expected) def test_skipfooter_with_decimal(self): diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index ef8f7967193ff..2fc238acd54e3 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -50,7 +50,7 @@ def test_c_engine(self): read_table(StringIO(data), sep=None, delim_whitespace=False, dtype={'a': float}) with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep='\s', dtype={'a': float}) + read_table(StringIO(data), sep=r'\s', dtype={'a': float}) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), skipfooter=1, dtype={'a': float}) @@ -59,7 +59,7 @@ def test_c_engine(self): read_table(StringIO(data), engine='c', sep=None, delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), engine='c', sep='\s') + read_table(StringIO(data), engine='c', sep=r'\s') with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', skipfooter=1) @@ -67,7 +67,7 @@ def test_c_engine(self): with tm.assert_produces_warning(parsers.ParserWarning): read_table(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): - read_table(StringIO(data), sep='\s') + read_table(StringIO(data), sep=r'\s') with tm.assert_produces_warning(parsers.ParserWarning): read_table(StringIO(data), skipfooter=1) @@ -79,9 +79,9 @@ def test_c_engine(self): msg = 'Error tokenizing data' with tm.assertRaisesRegexp(CParserError, msg): - read_table(StringIO(text), sep='\s+') + read_table(StringIO(text), sep=r'\s+') with tm.assertRaisesRegexp(CParserError, msg): - read_table(StringIO(text), engine='c', sep='\s+') + read_table(StringIO(text), engine='c', sep=r'\s+') msg = "Only length-1 thousands markers supported" data = """A|B|C diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 16a19c50be960..5051171ccb8f0 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -139,7 +139,7 @@ def test_usecols_regex_sep(self): # see gh-2733 data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b')) + df = self.read_csv(StringIO(data), sep=r'\s+', usecols=('a', 'b')) expected = DataFrame({'a': ['apple', 'orange'], 
'b': ['bat', 'cow']}, index=[4, 8]) diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index 6c5ee6fcd22ba..af17945bbcf95 100644 --- a/pandas/io/tests/test_clipboard.py +++ b/pandas/io/tests/test_clipboard.py @@ -71,7 +71,7 @@ def check_round_trip_frame(self, data_type, excel=None, sep=None): def test_round_trip_frame_sep(self): for dt in self.data_types: self.check_round_trip_frame(dt, sep=',') - self.check_round_trip_frame(dt, sep='\s+') + self.check_round_trip_frame(dt, sep=r'\s+') self.check_round_trip_frame(dt, sep='|') def test_round_trip_frame_string(self): diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index a4132cd69141a..49a508dd22023 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1805,8 +1805,8 @@ def wrapped(self, *args, **kwargs): if openpyxl_compat.is_compat(major_ver=major_ver): orig_method(self, *args, **kwargs) else: - msg = ('Installed openpyxl is not supported at this ' - 'time\. Use.+') + msg = (r'Installed openpyxl is not supported at this ' + r'time\. Use.+') with tm.assertRaisesRegexp(ValueError, msg): orig_method(self, *args, **kwargs) return wrapped diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 7b4e775db9476..c202c60f5213d 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -354,7 +354,7 @@ def test_regex_idempotency(self): def test_negative_skiprows(self): with tm.assertRaisesRegexp(ValueError, - '\(you passed a negative value\)'): + r'\(you passed a negative value\)'): self.read_html(self.spam_data, 'Water', skiprows=-1) @network diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 213bc53e3aab4..72973105ff3bd 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -987,7 +987,7 @@ def test_latin_encoding(self): if compat.PY2: self.assertRaisesRegexp( - TypeError, '\[unicode\] is not implemented as a table column') + TypeError, r'\[unicode\] is not implemented as a table column') return values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index dd86e9e791e5e..2b284ac631d3f 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -182,7 +182,7 @@ def test_bad_take(self): self.assertRaises(IndexError, lambda: self.arr.take(-11)) def test_take_invalid_kwargs(self): - msg = "take\(\) got an unexpected keyword argument 'foo'" + msg = r"take\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, self.arr.take, [2, 3], foo=2) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 3bbfd621d2342..e7c32a4baa4ea 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -89,7 +89,7 @@ def has_vertically_truncated_repr(df): r = repr(df) only_dot_row = False for row in r.splitlines(): - if re.match('^[\.\ ]+$', row): + if re.match(r'^[\.\ ]+$', row): only_dot_row = True return only_dot_row @@ -834,7 +834,7 @@ def check_with_width(df, col_space): # check that col_space affects HTML generation # and be very brittle about it. 
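These conversions, here and in the surrounding hunks, swap plain string literals such as '\s+' for raw strings, presumably because escapes like \s, \d and \( are not valid Python string escapes and Python 3.6 begins emitting a DeprecationWarning for them (hidden by default); the r prefix passes the backslash through to the regex engine unchanged. A minimal standalone sketch of the equivalence:

    import re

    plain = '\s+'    # same characters, but Python 3.6+ flags this literal with
                     # "DeprecationWarning: invalid escape sequence '\s'"
    raw = r'\s+'     # raw string: no warning, backslash kept verbatim
    assert plain == raw
    assert re.split(raw, 'a  b\tc') == ['a', 'b', 'c']
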
html = df.to_html(col_space=col_space) - hdrs = [x for x in html.split("\n") if re.search("\s]", x)] + hdrs = [x for x in html.split(r"\n") if re.search(r"\s]", x)] self.assertTrue(len(hdrs) > 0) for h in hdrs: self.assertTrue("min-width" in h) @@ -1940,7 +1940,7 @@ def test_to_string(self): float_format='%.5f'.__mod__) lines = result.split('\n') header = lines[0].strip().split() - joined = '\n'.join([re.sub('\s+', ' ', x).strip() for x in lines[1:]]) + joined = '\n'.join([re.sub(r'\s+', ' ', x).strip() for x in lines[1:]]) recons = read_table(StringIO(joined), names=header, header=None, sep=' ') tm.assert_series_equal(recons['B'], biggie['B']) @@ -3782,7 +3782,7 @@ def chck_ncols(self, s): res = repr(s) lines = res.split('\n') lines = [line for line in repr(s).split('\n') - if not re.match('[^\.]*\.+', line)][:-1] + if not re.match(r'[^\.]*\.+', line)][:-1] ncolsizes = len(set(len(line.strip()) for line in lines)) self.assertEqual(ncolsizes, 1) @@ -3823,7 +3823,7 @@ def test_max_rows_eq_one(self): def test_truncate_ndots(self): def getndots(s): - return len(re.match('[^\.]*(\.*)', s).groups()[0]) + return len(re.match(r'[^\.]*(\.*)', s).groups()[0]) s = Series([0, 2, 3, 6]) with option_context("display.max_rows", 2): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 390d796ced006..e73d3c58aea85 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -806,7 +806,7 @@ def test_sem(self): def test_sort_invalid_kwargs(self): df = DataFrame([1, 2, 3], columns=['a']) - msg = "sort\(\) got an unexpected keyword argument 'foo'" + msg = r"sort\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, df.sort, foo=2) # Neither of these should raise an error because they diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e55ba3e161ed9..489c85a7234b8 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -304,7 +304,7 @@ def test_constructor_error_msgs(self): 'B': ['a', 'b', 'c']}) # wrong size ndarray, GH 3105 - msg = "Shape of passed values is \(3, 4\), indices imply \(3, 3\)" + msg = r"Shape of passed values is \(3, 4\), indices imply \(3, 3\)" with tm.assertRaisesRegexp(ValueError, msg): DataFrame(np.arange(12).reshape((4, 3)), columns=['foo', 'bar', 'baz'], @@ -316,11 +316,11 @@ def test_constructor_error_msgs(self): # wrong size axis labels with tm.assertRaisesRegexp(ValueError, "Shape of passed values is " - "\(3, 2\), indices imply \(3, 1\)"): + r"\(3, 2\), indices imply \(3, 1\)"): DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1]) with tm.assertRaisesRegexp(ValueError, "Shape of passed values is " - "\(3, 2\), indices imply \(2, 2\)"): + r"\(3, 2\), indices imply \(2, 2\)"): DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2]) with tm.assertRaisesRegexp(ValueError, 'If using all scalar values, ' diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 29662c5addb75..36ae5dac733a5 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1124,8 +1124,8 @@ def test_invalid_type_for_operator_raises(self): ops = '+', '-', '*', '/' for op in ops: with tm.assertRaisesRegexp(TypeError, - "unsupported operand type\(s\) for " - ".+: '.+' and '.+'"): + r"unsupported operand type\(s\) for " + r".+: '.+' and '.+'"): df.eval('a {0} b'.format(op), engine=self.engine, parser=self.parser) diff 
--git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index bed0e0623ace0..3bc388da5bec8 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -550,7 +550,7 @@ def test_regex_replace_numeric_to_object_conversion(self): self.assertEqual(res.a.dtype, np.object_) def test_replace_regex_metachar(self): - metachars = '[]', '()', '\d', '\w', '\s' + metachars = '[]', '()', r'\d', r'\w', r'\s' for metachar in metachars: df = DataFrame({'a': [metachar, 'else']}) @@ -889,7 +889,7 @@ def test_replace_doesnt_replace_without_regex(self): 2 2 0 0 0 3 3 0 bt 0""" df = pd.read_csv(StringIO(raw), sep=r'\s+') - res = df.replace({'\D': 1}) + res = df.replace({r'\D': 1}) assert_frame_equal(df, res) def test_replace_bool_with_string(self): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 773f20532e4ff..1b373baf9b3c1 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -431,7 +431,7 @@ def test_take_invalid_kwargs(self): idx = self.create_index() indices = [1, 2] - msg = "take\(\) got an unexpected keyword argument 'foo'" + msg = r"take\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, idx.take, indices, foo=2) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c76f5ff22c534..819b88bf4c5d3 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -890,7 +890,7 @@ def test_take_invalid_kwargs(self): idx = pd.CategoricalIndex([1, 2, 3], name='foo') indices = [1, 0, -1] - msg = "take\(\) got an unexpected keyword argument 'foo'" + msg = r"take\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, idx.take, indices, foo=2) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index fdc5a2eaec812..61a4ea53f06fb 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1870,7 +1870,7 @@ def take_invalid_kwargs(self): idx = pd.MultiIndex.from_product(vals, names=['str', 'dt']) indices = [1, 2] - msg = "take\(\) got an unexpected keyword argument 'foo'" + msg = r"take\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, idx.take, indices, foo=2) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a50d3d28e5a11..9ca1fd2a76817 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1338,7 +1338,7 @@ def test_at_to_fail(self): df.columns = ['x', 'x', 'z'] # Check that we get the correct value in the KeyError - self.assertRaisesRegexp(KeyError, "\['y'\] not in index", + self.assertRaisesRegexp(KeyError, r"\['y'\] not in index", lambda: df[['x', 'y', 'z']]) def test_loc_getitem_label_slice(self): @@ -2232,7 +2232,7 @@ def f(): with tm.assertRaisesRegexp( KeyError, 'MultiIndex Slicing requires the index to be fully ' - 'lexsorted tuple len \(2\), lexsort depth \(0\)'): + r'lexsorted tuple len \(2\), lexsort depth \(0\)'): df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] def test_multiindex_slicers_non_unique(self): @@ -3646,7 +3646,7 @@ def test_mi_access(self): 5 f B 6 A2 6 """ - df = pd.read_csv(StringIO(data), sep='\s+', index_col=0) + df = pd.read_csv(StringIO(data), sep=r'\s+', index_col=0) df2 = df.set_index(['main', 'sub']).T.sort_index(1) index = Index(['h1', 'h3', 'h5']) columns = MultiIndex.from_tuples([('A', 'A1')], names=['main', 'sub']) diff --git 
a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 24e3a0ff5f325..6de1a68464436 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1618,7 +1618,7 @@ def test_reshape_bad_kwarg(self): tm.assertRaisesRegexp(TypeError, msg, a.reshape, (2, 2), foo=2) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - msg = "reshape\(\) got an unexpected keyword argument 'foo'" + msg = r"reshape\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, a.reshape, a.shape, foo=2) def test_numpy_reshape(self): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index eaa316bfd8157..da8cf120b8ed4 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -900,7 +900,7 @@ def test_duplicated_drop_duplicates_index(self): tm.assert_index_equal(result, idx[~expected]) with tm.assertRaisesRegexp( - TypeError, "drop_duplicates\(\) got an unexpected " + TypeError, r"drop_duplicates\(\) got an unexpected " "keyword argument"): idx.drop_duplicates(inplace=True) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index cdcd8b1bcba60..84df82db69f77 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1740,8 +1740,8 @@ def test_numpy_squeeze(self): np.squeeze, s, axis=0) def test_transpose(self): - msg = ("transpose\(\) got multiple values for " - "keyword argument 'axes'") + msg = (r"transpose\(\) got multiple values for " + r"keyword argument 'axes'") for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: # calls implementation in pandas/core/base.py @@ -1831,7 +1831,7 @@ def test_take_invalid_kwargs(self): p4d = tm.makePanel4D() for obj in (s, df, p, p4d): - msg = "take\(\) got an unexpected keyword argument 'foo'" + msg = r"take\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, obj.take, indices, foo=2) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 6a97f195abba7..db1c8da4cae73 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -82,7 +82,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): values = (mat * 1e9).astype('M8[ns]') elif typestr.startswith('M8[ns'): # datetime with tz - m = re.search('M8\[ns,\s*(\w+\/?\w*)\]', typestr) + m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr) assert m is not None, "incompatible typestr -> {0}".format(typestr) tz = m.groups()[0] assert num_items == 1, "must have only 1 num items for a tz-aware" diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 516c406f8d54f..4e7ace4173227 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -554,7 +554,7 @@ def test_xs_level_multiple(self): a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - df = read_table(StringIO(text), sep='\s+', engine='python') + df = read_table(StringIO(text), sep=r'\s+', engine='python') result = df.xs(('a', 4), level=['one', 'four']) expected = df.xs('a').xs(4, level='four') @@ -588,7 +588,7 @@ def test_xs_level0(self): a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - df = read_table(StringIO(text), sep='\s+', engine='python') + df = read_table(StringIO(text), sep=r'\s+', engine='python') result = df.xs('a', level=0) expected = df.xs('a') diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 
a197037789fd2..9cb2dd5a40ac4 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -496,8 +496,8 @@ def test_setitem(self): # bad shape p = Panel(np.random.randn(4, 3, 2)) with tm.assertRaisesRegexp(ValueError, - "shape of value must be \(3, 2\), " - "shape of given object was \(4, 2\)"): + r"shape of value must be \(3, 2\), " + r"shape of given object was \(4, 2\)"): p[0] = np.random.randn(4, 2) def test_setitem_ndarray(self): @@ -1128,24 +1128,24 @@ def testit(): Panel(np.random.randn(3, 4, 5), lrange(4), lrange(5), lrange(5)) assertRaisesRegexp(ValueError, - "Shape of passed values is \(3, 4, 5\), " - "indices imply \(4, 5, 5\)", + r"Shape of passed values is \(3, 4, 5\), " + r"indices imply \(4, 5, 5\)", testit) def testit(): Panel(np.random.randn(3, 4, 5), lrange(5), lrange(4), lrange(5)) assertRaisesRegexp(ValueError, - "Shape of passed values is \(3, 4, 5\), " - "indices imply \(5, 4, 5\)", + r"Shape of passed values is \(3, 4, 5\), " + r"indices imply \(5, 4, 5\)", testit) def testit(): Panel(np.random.randn(3, 4, 5), lrange(5), lrange(5), lrange(4)) assertRaisesRegexp(ValueError, - "Shape of passed values is \(3, 4, 5\), " - "indices imply \(5, 5, 4\)", + r"Shape of passed values is \(3, 4, 5\), " + r"indices imply \(5, 5, 4\)", testit) def test_conform(self): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 9a3505c3421e0..bbcd856250c51 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -426,7 +426,7 @@ def test_replace(self): # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) - result = values.str.replace("(?<=\w),(?=\w)", ", ", flags=re.UNICODE) + result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE) tm.assert_series_equal(result, exp) # GH 13438 @@ -670,12 +670,12 @@ def check_index(index): data = ['A1', 'B2', 'C'] index = index[:len(data)] s = Series(data, index=index) - result = s.str.extract('(\d)', expand=False) + result = s.str.extract(r'(\d)', expand=False) exp = Series(['1', '2', NA], index=index) tm.assert_series_equal(result, exp) result = Series(data, index=index).str.extract( - '(?P\D)(?P\d)?', expand=False) + r'(?P\D)(?P\d)?', expand=False) e_list = [ ['A', '1'], ['B', '2'], @@ -828,12 +828,13 @@ def test_extract_optional_groups(self): def check_index(index): data = ['A1', 'B2', 'C'] index = index[:len(data)] - result = Series(data, index=index).str.extract('(\d)', expand=True) + result = Series(data, index=index).str.extract( + r'(\d)', expand=True) exp = DataFrame(['1', '2', NA], index=index) tm.assert_frame_equal(result, exp) result = Series(data, index=index).str.extract( - '(?P\D)(?P\d)?', expand=True) + r'(?P\D)(?P\d)?', expand=True) e_list = [ ['A', '1'], ['B', '2'], @@ -1023,7 +1024,7 @@ def test_extractall_no_matches(self): def test_extractall_stringindex(self): s = Series(["a1a2", "b1", "c1"], name='xxx') - res = s.str.extractall("[ab](?P\d)") + res = s.str.extractall(r"[ab](?P\d)") exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], names=[None, 'match']) exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) @@ -1034,12 +1035,12 @@ def test_extractall_stringindex(self): for idx in [Index(["a1a2", "b1", "c1"]), Index(["a1a2", "b1", "c1"], name='xxx')]: - res = idx.str.extractall("[ab](?P\d)") + res = idx.str.extractall(r"[ab](?P\d)") tm.assert_frame_equal(res, exp) s = Series(["a1a2", "b1", "c1"], name='s_name', index=Index(["XX", "yy", "zz"], name='idx_name')) - res = 
s.str.extractall("[ab](?P\d)") + res = s.str.extractall(r"[ab](?P\d)") exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", 'match']) exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index c242213ee226f..7a217ed9dbd86 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -319,10 +319,10 @@ def test_numpy_array_equal_copy_flag(self): a = np.array([1, 2, 3]) b = a.copy() c = a.view() - expected = 'array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)' + expected = r'array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)' with assertRaisesRegexp(AssertionError, expected): assert_numpy_array_equal(a, b, check_same='same') - expected = 'array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)' + expected = r'array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)' with assertRaisesRegexp(AssertionError, expected): assert_numpy_array_equal(a, c, check_same='copy') diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index f5828dab21e37..ee33e24c7f6c4 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -97,8 +97,8 @@ def test_bad_arg_length_max_value_single(self): min_fname_arg_count = 0 max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count - msg = ("{fname}\(\) takes at most {max_length} " - "argument \({actual_length} given\)" + msg = (r"{fname}\(\) takes at most {max_length} " + r"argument \({actual_length} given\)" .format(fname=self.fname, max_length=max_length, actual_length=actual_length)) @@ -114,8 +114,8 @@ def test_bad_arg_length_max_value_multiple(self): min_fname_arg_count = 2 max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count - msg = ("{fname}\(\) takes at most {max_length} " - "arguments \({actual_length} given\)" + msg = (r"{fname}\(\) takes at most {max_length} " + r"arguments \({actual_length} given\)" .format(fname=self.fname, max_length=max_length, actual_length=actual_length)) @@ -127,7 +127,7 @@ def test_bad_arg_length_max_value_multiple(self): def test_not_all_defaults(self): bad_arg = 'foo' msg = ("the '{arg}' parameter is not supported " - "in the pandas implementation of {func}\(\)". + r"in the pandas implementation of {func}\(\)". format(arg=bad_arg, func=self.fname)) compat_args = OrderedDict() @@ -163,8 +163,8 @@ def test_bad_kwarg(self): compat_args[goodarg] = 'foo' compat_args[badarg + 'o'] = 'bar' kwargs = {goodarg: 'foo', badarg: 'bar'} - msg = ("{fname}\(\) got an unexpected " - "keyword argument '{arg}'".format( + msg = (r"{fname}\(\) got an unexpected " + r"keyword argument '{arg}'".format( fname=self.fname, arg=badarg)) with tm.assertRaisesRegexp(TypeError, msg): @@ -172,8 +172,8 @@ def test_bad_kwarg(self): def test_not_all_none(self): bad_arg = 'foo' - msg = ("the '{arg}' parameter is not supported " - "in the pandas implementation of {func}\(\)". + msg = (r"the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)". 
format(arg=bad_arg, func=self.fname)) compat_args = OrderedDict() @@ -212,8 +212,8 @@ def test_invalid_total_length_max_length_one(self): min_fname_arg_count = 0 max_length = len(compat_args) + min_fname_arg_count actual_length = len(kwargs) + len(args) + min_fname_arg_count - msg = ("{fname}\(\) takes at most {max_length} " - "argument \({actual_length} given\)" + msg = (r"{fname}\(\) takes at most {max_length} " + r"argument \({actual_length} given\)" .format(fname=self.fname, max_length=max_length, actual_length=actual_length)) @@ -230,8 +230,8 @@ def test_invalid_total_length_max_length_multiple(self): min_fname_arg_count = 2 max_length = len(compat_args) + min_fname_arg_count actual_length = len(kwargs) + len(args) + min_fname_arg_count - msg = ("{fname}\(\) takes at most {max_length} " - "arguments \({actual_length} given\)" + msg = (r"{fname}\(\) takes at most {max_length} " + r"arguments \({actual_length} given\)" .format(fname=self.fname, max_length=max_length, actual_length=actual_length)) @@ -248,8 +248,8 @@ def test_no_args_with_kwargs(self): compat_args['foo'] = -5 compat_args[bad_arg] = 1 - msg = ("the '{arg}' parameter is not supported " - "in the pandas implementation of {func}\(\)". + msg = (r"the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)". format(arg=bad_arg, func=self.fname)) args = () @@ -275,8 +275,8 @@ def test_duplicate_argument(self): kwargs = {'foo': None, 'bar': None} args = (None,) # duplicate value for 'foo' - msg = ("{fname}\(\) got multiple values for keyword " - "argument '{arg}'".format(fname=self.fname, arg='foo')) + msg = (r"{fname}\(\) got multiple values for keyword " + r"argument '{arg}'".format(fname=self.fname, arg='foo')) with tm.assertRaisesRegexp(TypeError, msg): validate_args_and_kwargs(self.fname, args, kwargs, diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index a6d58fa3e7ef3..bca50237081e1 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -807,7 +807,7 @@ def test_take_invalid_kwargs(self): idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') indices = [1, 6, 5, 9, 10, 13, 15, 3] - msg = "take\(\) got an unexpected keyword argument 'foo'" + msg = r"take\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, idx.take, indices, foo=2) @@ -1639,7 +1639,7 @@ def test_take_invalid_kwargs(self): idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') indices = [1, 6, 5, 9, 10, 13, 15, 3] - msg = "take\(\) got an unexpected keyword argument 'foo'" + msg = r"take\(\) got an unexpected keyword argument 'foo'" tm.assertRaisesRegexp(TypeError, msg, idx.take, indices, foo=2) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index e314081eac373..9bdf420ca6084 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -3730,11 +3730,11 @@ def test_add_raises(self): # GH 4731 dt1 = Period(freq='D', year=2008, month=1, day=1) dt2 = Period(freq='D', year=2008, month=1, day=2) - msg = "unsupported operand type\(s\)" + msg = r"unsupported operand type\(s\)" with tm.assertRaisesRegexp(TypeError, msg): dt1 + "str" - msg = "unsupported operand type\(s\)" + msg = r"unsupported operand type\(s\)" with tm.assertRaisesRegexp(TypeError, msg): "str" + dt1 @@ -3748,7 +3748,7 @@ def test_sub(self): self.assertEqual(dt1 - dt2, -14) self.assertEqual(dt2 - dt1, 14) - msg = "Input has different freq=M from Period\(freq=D\)" + msg = 
r"Input has different freq=M from Period\(freq=D\)" with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): dt1 - pd.Period('2011-02', freq='M') @@ -4112,7 +4112,7 @@ def test_period_ops_offset(self): exp = pd.Period('2011-03-30', freq='D') self.assertEqual(result, exp) - msg = "Input cannot be converted to Period\(freq=D\)" + msg = r"Input cannot be converted to Period\(freq=D\)" with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): p + offsets.Hour(2) @@ -4161,7 +4161,7 @@ def test_pi_ops_errors(self): '2011-04'], freq='M', name='idx') s = pd.Series(idx) - msg = "unsupported operand type\(s\)" + msg = r"unsupported operand type\(s\)" for obj in [idx, s]: for ng in ["str", 1.5]: @@ -4265,8 +4265,8 @@ def test_pi_offset_errors(self): # Series op is applied per Period instance, thus error is raised # from Period - msg_idx = "Input has different freq from PeriodIndex\(freq=D\)" - msg_s = "Input cannot be converted to Period\(freq=D\)" + msg_idx = r"Input has different freq from PeriodIndex\(freq=D\)" + msg_s = r"Input cannot be converted to Period\(freq=D\)" for obj, msg in [(idx, msg_idx), (s, msg_s)]: with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): obj + offsets.Hour(2) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index aa8a5d10cd9d3..67b203d011d1a 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -5160,11 +5160,13 @@ def test_partial_slice_doesnt_require_monotonicity(self): timestamp = pd.Timestamp('2014-01-10') assert_series_equal(nonmonotonic['2014-01-10':], expected) - self.assertRaisesRegexp(KeyError, "Timestamp\('2014-01-10 00:00:00'\)", + self.assertRaisesRegexp(KeyError, + r"Timestamp\('2014-01-10 00:00:00'\)", lambda: nonmonotonic[timestamp:]) assert_series_equal(nonmonotonic.ix['2014-01-10':], expected) - self.assertRaisesRegexp(KeyError, "Timestamp\('2014-01-10 00:00:00'\)", + self.assertRaisesRegexp(KeyError, + r"Timestamp\('2014-01-10 00:00:00'\)", lambda: nonmonotonic.ix[timestamp:]) @@ -5284,7 +5286,7 @@ def test_to_datetime_with_non_exact(self): s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00', '19MAY11 00:00:00Z']) result = to_datetime(s, format='%d%b%y', exact=False) - expected = to_datetime(s.str.extract('(\d+\w+\d+)', expand=False), + expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False), format='%d%b%y') assert_series_equal(result, expected) From bec5bdb89179cf637e5101ed7106e986570e7f95 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 18 Nov 2016 06:07:34 -0500 Subject: [PATCH 074/183] BUG: fix pickling of Custom offsets in 3.6 xref #14679 Author: Jeff Reback Closes #14685 from jreback/offsets and squashes the following commits: 8ad212c [Jeff Reback] BUG: fix pickling of Custom offsets in 3.6 --- pandas/tseries/offsets.py | 50 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 2e3852a7edddd..efcde100d1ce7 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -553,6 +553,32 @@ def _repr_attrs(self): out += ': ' + ', '.join(attrs) return out + def __getstate__(self): + """Return a pickleable state""" + state = self.__dict__.copy() + + # we don't want to actually pickle the calendar object + # as its a np.busyday; we recreate on deserilization + if 'calendar' in state: + del state['calendar'] + try: + state['kwds'].pop('calendar') + except KeyError: + pass + + return state + + def 
__setstate__(self, state): + """Reconstruct an instance from a pickled state""" + self.__dict__ = state + if 'weekmask' in state and 'holidays' in state: + calendar, holidays = self.get_calendar(weekmask=self.weekmask, + holidays=self.holidays, + calendar=None) + self.kwds['calendar'] = self.calendar = calendar + self.kwds['holidays'] = self.holidays = holidays + self.kwds['weekmask'] = state['weekmask'] + class BusinessDay(BusinessMixin, SingleConstructorOffset): """ @@ -992,30 +1018,6 @@ def get_calendar(self, weekmask, holidays, calendar): busdaycalendar = np.busdaycalendar(**kwargs) return busdaycalendar, holidays - def __getstate__(self): - """Return a pickleable state""" - state = self.__dict__.copy() - del state['calendar'] - - # we don't want to actually pickle the calendar object - # as its a np.busyday; we recreate on deserilization - try: - state['kwds'].pop('calendar') - except: - pass - - return state - - def __setstate__(self, state): - """Reconstruct an instance from a pickled state""" - self.__dict__ = state - calendar, holidays = self.get_calendar(weekmask=self.weekmask, - holidays=self.holidays, - calendar=None) - self.kwds['calendar'] = self.calendar = calendar - self.kwds['holidays'] = self.holidays = holidays - self.kwds['weekmask'] = state['weekmask'] - @apply_wraps def apply(self, other): if self.n <= 0: From b6ffd89fa5cfd889d0b5428bec05d3018765441d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 17 Nov 2016 20:34:45 -0500 Subject: [PATCH 075/183] ERR: more informative message on invalid Timestamp input TST: fix unordable error message xref #14679 TST: handle unorderable exceptions in indexing closes #14684 --- doc/source/whatsnew/v0.19.2.txt | 6 ++++++ pandas/compat/__init__.py | 1 + pandas/core/indexing.py | 3 ++- pandas/core/series.py | 3 ++- pandas/tests/indexes/test_base.py | 21 +++++++++++++++++---- pandas/tslib.pyx | 3 ++- pandas/types/common.py | 17 ++++++++++++++++- 7 files changed, 46 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index f4a45a6938a95..499a7e734f616 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -41,5 +41,11 @@ Bug Fixes - Bug in not propogating exceptions in parsing invalid datetimes, noted in python 3.6 (:issue:`14561`) +- Compat with python 3.6 for pickling of some offsets (:issue:`14685`) +- Compat with python 3.6 for some indexing exception types (:issue:`14684`) +- Compat with python 3.6 for deprecation warnings in the test suite (:issue:`14681`) + + + - Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 1b8930dcae0f1..532f960468204 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -41,6 +41,7 @@ PY2 = sys.version_info[0] == 2 PY3 = (sys.version_info[0] >= 3) PY35 = (sys.version_info >= (3, 5)) +PY36 = (sys.version_info >= (3, 6)) try: import __builtin__ as builtins diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 35fcf0d49d0d6..660e8c9446202 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,6 +11,7 @@ is_sequence, is_scalar, is_sparse, + _is_unorderable_exception, _ensure_platform_int) from pandas.types.missing import isnull, _infer_fill_value @@ -1411,7 +1412,7 @@ def error(): except TypeError as e: # python 3 type errors should be raised - if 'unorderable' in str(e): # pragma: no cover + if _is_unorderable_exception(e): error() 
raise except: diff --git a/pandas/core/series.py b/pandas/core/series.py index 2310e75f3d3fa..105e39562f561 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -25,6 +25,7 @@ is_iterator, is_dict_like, is_scalar, + _is_unorderable_exception, _ensure_platform_int) from pandas.types.generic import ABCSparseArray, ABCDataFrame from pandas.types.cast import (_maybe_upcast, _infer_dtype_from_scalar, @@ -753,7 +754,7 @@ def setitem(key, value): raise ValueError("Can only tuple-index with a MultiIndex") # python 3 type errors should be raised - if 'unorderable' in str(e): # pragma: no cover + if _is_unorderable_exception(e): raise IndexError(key) if com.is_bool_indexer(key): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ad7e3890b5f32..329e85d82122e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -7,7 +7,7 @@ from .common import Base from pandas.compat import (is_platform_windows, range, lrange, lzip, u, - zip, PY3) + zip, PY3, PY36) import operator import os @@ -1774,7 +1774,12 @@ def create_index(self): def test_order(self): idx = self.create_index() # 9816 deprecated - if PY3: + if PY36: + with tm.assertRaisesRegexp(TypeError, "'>' not supported " + "between instances of 'str' and 'int'"): + with tm.assert_produces_warning(FutureWarning): + idx.order() + elif PY3: with tm.assertRaisesRegexp(TypeError, "unorderable types"): with tm.assert_produces_warning(FutureWarning): idx.order() @@ -1784,7 +1789,11 @@ def test_order(self): def test_argsort(self): idx = self.create_index() - if PY3: + if PY36: + with tm.assertRaisesRegexp(TypeError, "'>' not supported " + "between instances of 'str' and 'int'"): + result = idx.argsort() + elif PY3: with tm.assertRaisesRegexp(TypeError, "unorderable types"): result = idx.argsort() else: @@ -1794,7 +1803,11 @@ def test_argsort(self): def test_numpy_argsort(self): idx = self.create_index() - if PY3: + if PY36: + with tm.assertRaisesRegexp(TypeError, "'>' not supported " + "between instances of 'str' and 'int'"): + result = np.argsort(idx) + elif PY3: with tm.assertRaisesRegexp(TypeError, "unorderable types"): result = np.argsort(idx) else: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 91d3f0ef70cfe..e05363de2983a 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1539,7 +1539,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, "Cannot convert Period to Timestamp " "unambiguously. 
Use to_timestamp") else: - raise TypeError('Cannot convert input to Timestamp') + raise TypeError('Cannot convert input [{}] of type {} to ' + 'Timestamp'.format(ts, type(ts))) if obj.value != NPY_NAT: _check_dts_bounds(&obj.dts) diff --git a/pandas/types/common.py b/pandas/types/common.py index e0e4501738745..691e15610867b 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -1,7 +1,8 @@ """ common type operations """ import numpy as np -from pandas.compat import string_types, text_type, binary_type +from pandas.compat import (string_types, text_type, binary_type, + PY3, PY36) from pandas import lib, algos from .dtypes import (CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType, @@ -188,6 +189,20 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype): return issubclass(tipo, (np.datetime64, np.timedelta64)) +def _is_unorderable_exception(e): + """ + return a boolean if we an unorderable exception error message + + These are different error message for PY>=3<=3.5 and PY>=3.6 + """ + if PY36: + return ("'>' not supported between instances " + "of 'str' and 'int'" in str(e)) + elif PY3: + return 'unorderable' in str(e) + return False + + def is_numeric_v_string_like(a, b): """ numpy doesn't like to compare numeric arrays vs scalar string-likes From 2d8160e8c46b8d636c6c66027d310866fa4c5908 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 18 Nov 2016 06:25:39 -0500 Subject: [PATCH 076/183] COMPAT: pickle compat for Timestamp in py3.6 BUG: fix unorderable exception types in py3.6 closes #14689 --- doc/source/whatsnew/v0.19.2.txt | 3 ++- pandas/tslib.pyx | 6 ++++++ pandas/types/common.py | 4 ++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 499a7e734f616..8ec4a35b0da17 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -42,8 +42,9 @@ Bug Fixes - Compat with python 3.6 for pickling of some offsets (:issue:`14685`) -- Compat with python 3.6 for some indexing exception types (:issue:`14684`) +- Compat with python 3.6 for some indexing exception types (:issue:`14684`, :issue:`14689`) - Compat with python 3.6 for deprecation warnings in the test suite (:issue:`14681`) +- Compat with python 3.6 for Timestamp pickles (:issue:`14689`) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index e05363de2983a..acc0e45562cf2 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1105,6 +1105,12 @@ cdef class _Timestamp(datetime): self._assert_tzawareness_compat(other) return _cmp_scalar(self.value, ots.value, op) + def __reduce_ex__(self, protocol): + # python 3.6 compat + # http://bugs.python.org/issue28730 + # now __reduce_ex__ is defined and higher priority than __reduce__ + return self.__reduce__() + def __repr__(self): stamp = self._repr_base zone = None diff --git a/pandas/types/common.py b/pandas/types/common.py index 691e15610867b..754ff80924c07 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -196,8 +196,8 @@ def _is_unorderable_exception(e): These are different error message for PY>=3<=3.5 and PY>=3.6 """ if PY36: - return ("'>' not supported between instances " - "of 'str' and 'int'" in str(e)) + return "'>' not supported between instances of" in str(e) + elif PY3: return 'unorderable' in str(e) return False From dca0185388d7dce3b1f9e39955c209de1184836a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 18 Nov 2016 07:25:47 -0500 Subject: [PATCH 077/183] DOC: compat notice for 3.6 --- doc/source/install.rst | 2 
+- doc/source/whatsnew/v0.19.2.txt | 4 ++++ doc/source/whatsnew/v0.20.0.txt | 3 +-- setup.py | 1 + 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 923c22aa9048f..55b6b5fa69efb 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 2.7, 3.4, and 3.5 +Officially Python 2.7, 3.4, 3.5, and 3.6 Installing pandas ----------------- diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 8ec4a35b0da17..4e2c6e2faeaa5 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -7,6 +7,10 @@ This is a minor bug-fix release from 0.19.1 and includes some small regression f bug fixes and performance improvements. We recommend that all users upgrade to this version. +Highlights include: + +- Compatibility with Python 3.6 + .. contents:: What's new in v0.19.2 :local: :backlinks: none diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8819a95f27b0d..d0e6781fd6e42 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -12,7 +12,7 @@ Highlights include: Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. -.. contents:: What's new in v0.19.0 +.. contents:: What's new in v0.20.0 :local: :backlinks: none @@ -80,4 +80,3 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - diff --git a/setup.py b/setup.py index 351d2b39ce6aa..a982ccb8e9463 100755 --- a/setup.py +++ b/setup.py @@ -244,6 +244,7 @@ def build_extensions(self): 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Programming Language :: Cython', 'Topic :: Scientific/Engineering', ] From c045e1d6774aaa32ce13def79901f1a1ad8792bf Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 18 Nov 2016 08:38:10 -0500 Subject: [PATCH 078/183] API: Rename CParserError to ParserError (#14479) Partially resolves gh-12665. We will remove CParserError in the future. --- doc/source/io.rst | 4 ++-- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/common.py | 8 +++++--- pandas/io/parsers.py | 4 ++-- pandas/io/tests/parser/common.py | 2 +- pandas/io/tests/parser/test_textreader.py | 6 +++--- pandas/io/tests/parser/test_unsupported.py | 10 +++++----- pandas/io/tests/test_common.py | 18 ++++++++++++++++++ pandas/io/tests/test_html.py | 4 ++-- pandas/parser.pyx | 17 ++++++++++------- pandas/tests/frame/test_to_csv.py | 4 ++-- 11 files changed, 51 insertions(+), 27 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ba1bd328d2991..ee319092c6dd5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1165,8 +1165,8 @@ too many will cause an error by default: In [28]: pd.read_csv(StringIO(data)) --------------------------------------------------------------------------- - CParserError Traceback (most recent call last) - CParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4 + ParserError Traceback (most recent call last) + ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4 You can elect to skip bad lines: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d0e6781fd6e42..581106924c77e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -41,6 +41,7 @@ Backwards incompatible API changes .. 
_whatsnew_0200.api: +- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) diff --git a/pandas/io/common.py b/pandas/io/common.py index 127ebc4839fd3..7076d5a62b626 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -65,13 +65,15 @@ def urlopen(*args, **kwargs): _VALID_URLS.discard('') -class CParserError(ValueError): +class ParserError(ValueError): """ - Exception that is thrown by the C engine when it encounters - a parsing error in `pd.read_csv` + Exception that is thrown by an error is encountered in `pd.read_csv` """ pass +# gh-12665: Alias for now and remove later. +CParserError = ParserError + class DtypeWarning(Warning): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 090a21632cddb..092cba093421a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -26,7 +26,7 @@ from pandas.io.date_converters import generic_parser from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, _get_handle, UnicodeReader, UTF8Recoder, - BaseIterator, CParserError, EmptyDataError, + BaseIterator, ParserError, EmptyDataError, ParserWarning, _NA_VALUES) from pandas.tseries import tools @@ -1141,7 +1141,7 @@ def tostr(x): # long for n in range(len(columns[0])): if all(['Unnamed' in tostr(c[n]) for c in columns]): - raise CParserError( + raise ParserError( "Passed header=[%s] are too many rows for this " "multi_index of columns" % ','.join([str(x) for x in self.header]) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 397292ec6d036..4cb00c48976a4 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -50,7 +50,7 @@ def test_bad_stream_exception(self): # Issue 13652: # This test validates that both python engine # and C engine will raise UnicodeDecodeError instead of - # c engine raising CParserError and swallowing exception + # c engine raising ParserError and swallowing exception # that caused read to fail. 
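Because ``CParserError`` is kept as an alias of ``ParserError``, existing ``except`` clauses keep working after the rename. A rough usage sketch of the renamed exception (the CSV string is made up for illustration):

    from pandas.compat import StringIO
    import pandas as pd
    from pandas.io.common import ParserError, CParserError

    assert CParserError is ParserError      # alias retained for backwards compatibility

    data = 'a,b,c\n1,2,3\n4,5,6,7'          # second data row has one field too many
    try:
        pd.read_csv(StringIO(data))
    except ParserError as err:              # catching CParserError behaves identically
        print(err)                          # Error tokenizing data. C error: Expected 3 fields ...
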
handle = open(self.csv_shiftjs, "rb") codec = codecs.lookup("utf-8") diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py index 7dda9eb9d0af4..49b70fc5e8703 100644 --- a/pandas/io/tests/parser/test_textreader.py +++ b/pandas/io/tests/parser/test_textreader.py @@ -154,7 +154,7 @@ def test_skip_bad_lines(self): reader = TextReader(StringIO(data), delimiter=':', header=None) - self.assertRaises(parser.CParserError, reader.read) + self.assertRaises(parser.ParserError, reader.read) reader = TextReader(StringIO(data), delimiter=':', header=None, @@ -197,7 +197,7 @@ def test_header_not_enough_lines(self): assert_array_dicts_equal(expected, recs) # not enough rows - self.assertRaises(parser.CParserError, TextReader, StringIO(data), + self.assertRaises(parser.ParserError, TextReader, StringIO(data), delimiter=',', header=5, as_recarray=True) def test_header_not_enough_lines_as_recarray(self): @@ -218,7 +218,7 @@ def test_header_not_enough_lines_as_recarray(self): assert_array_dicts_equal(expected, recs) # not enough rows - self.assertRaises(parser.CParserError, TextReader, StringIO(data), + self.assertRaises(parser.ParserError, TextReader, StringIO(data), delimiter=',', header=5, as_recarray=True) def test_escapechar(self): diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 2fc238acd54e3..5d60c20854a83 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -15,7 +15,7 @@ import pandas.util.testing as tm from pandas.compat import StringIO -from pandas.io.common import CParserError +from pandas.io.common import ParserError from pandas.io.parsers import read_csv, read_table @@ -78,10 +78,10 @@ def test_c_engine(self): x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" msg = 'Error tokenizing data' - with tm.assertRaisesRegexp(CParserError, msg): - read_table(StringIO(text), sep=r'\s+') - with tm.assertRaisesRegexp(CParserError, msg): - read_table(StringIO(text), engine='c', sep=r'\s+') + with tm.assertRaisesRegexp(ParserError, msg): + read_table(StringIO(text), sep='\s+') + with tm.assertRaisesRegexp(ParserError, msg): + read_table(StringIO(text), engine='c', sep='\s+') msg = "Only length-1 thousands markers supported" data = """A|B|C diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index c08d235b07c9e..3c980cae3351a 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -11,6 +11,7 @@ from pandas.compat import is_platform_windows, StringIO from pandas import read_csv, concat +import pandas as pd try: from pathlib import Path @@ -88,6 +89,23 @@ def test_iterator(self): tm.assert_frame_equal(first, expected.iloc[[0]]) tm.assert_frame_equal(concat(it), expected.iloc[1:]) + def test_error_rename(self): + # see gh-12665 + try: + raise common.CParserError() + except common.ParserError: + pass + + try: + raise common.ParserError() + except common.CParserError: + pass + + try: + raise common.ParserError() + except pd.parser.CParserError: + pass + class TestMMapWrapper(tm.TestCase): diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index c202c60f5213d..f4eec864da572 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -23,7 +23,7 @@ is_platform_windows) from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html -from pandas.parser import CParserError +from pandas.parser import ParserError import 
pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -652,7 +652,7 @@ def test_parse_dates_combine(self): def test_computer_sales_page(self): data = os.path.join(DATA_PATH, 'computer_sales_page.html') - with tm.assertRaisesRegexp(CParserError, r"Passed header=\[0,1\] are " + with tm.assertRaisesRegexp(ParserError, r"Passed header=\[0,1\] are " "too many rows for this multi_index " "of columns"): self.read_html(data, header=[0, 1]) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 93a494c176b99..9fb99637731be 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -13,8 +13,11 @@ from cpython cimport (PyObject, PyBytes_FromString, PyUnicode_Check, PyUnicode_AsUTF8String, PyErr_Occurred, PyErr_Fetch) from cpython.ref cimport PyObject, Py_XDECREF -from io.common import CParserError, DtypeWarning, EmptyDataError +from io.common import ParserError, DtypeWarning, EmptyDataError +# Import CParserError as alias of ParserError for backwards compatibility. +# Ultimately, we want to remove this import. See gh-12665 and gh-14479. +from io.common import CParserError cdef extern from "Python.h": object PyUnicode_FromString(char *v) @@ -719,7 +722,7 @@ cdef class TextReader: if isinstance(msg, list): msg = "[%s], len of %d," % ( ','.join([ str(m) for m in msg ]), len(msg)) - raise CParserError( + raise ParserError( 'Passed header=%s but only %d lines in file' % (msg, self.parser.lines)) @@ -812,7 +815,7 @@ cdef class TextReader: passed_count = len(header[0]) # if passed_count > field_count: - # raise CParserError('Column names have %d fields, ' + # raise ParserError('Column names have %d fields, ' # 'data has %d fields' # % (passed_count, field_count)) @@ -1004,7 +1007,7 @@ cdef class TextReader: (num_cols >= self.parser.line_fields[i]) * num_cols if self.table_width - self.leading_cols > num_cols: - raise CParserError( + raise ParserError( "Too many columns specified: expected %s and found %s" % (self.table_width - self.leading_cols, num_cols)) @@ -1059,7 +1062,7 @@ cdef class TextReader: self.use_unsigned) if col_res is None: - raise CParserError('Unable to parse column %d' % i) + raise ParserError('Unable to parse column %d' % i) results[i] = col_res @@ -1310,7 +1313,7 @@ def _is_file_like(obj): if PY3: import io if isinstance(obj, io.TextIOWrapper): - raise CParserError('Cannot handle open unicode files (yet)') + raise ParserError('Cannot handle open unicode files (yet)') # BufferedReader is a byte reader for Python 3 file = io.BufferedReader @@ -2015,7 +2018,7 @@ cdef raise_parser_error(object base, parser_t *parser): else: message += 'no error message set' - raise CParserError(message) + raise ParserError(message) def _concatenate_chunks(list chunks): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 6d09378ca864e..4d6a5bb32038d 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -8,7 +8,7 @@ import numpy as np from pandas.compat import (lmap, range, lrange, StringIO, u) -from pandas.parser import CParserError +from pandas.parser import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, date_range, read_csv, compat, to_datetime) import pandas as pd @@ -589,7 +589,7 @@ def _make_frame(names=None): for i in [5, 6, 7]: msg = 'len of {i}, but only 5 lines in file'.format(i=i) - with assertRaisesRegexp(CParserError, msg): + with assertRaisesRegexp(ParserError, msg): read_csv(path, tupleize_cols=False, header=lrange(i), index_col=0) From 
4a1a330343f31fc8ca62b98a84978475968cba9c Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Fri, 18 Nov 2016 17:36:36 -0500 Subject: [PATCH 079/183] BUG in clipboard (linux, python2) with unicode and separator (GH13747) vendered updated version of Pyperclip closes #13747 closes #14362 closes #12807 closes #12529 Author: Ajay Saxena Author: Ajay Saxena Closes #14599 from aileronajay/master and squashes the following commits: 2aafb66 [Ajay Saxena] moved comment inside test and added github issue labels to test b74fbc1 [Ajay Saxena] ignore lint test for pyperclip files 9db42d8 [Ajay Saxena] whatsnew conflict 1dca292 [Ajay Saxena] conflict resolution 98b61e8 [Ajay Saxena] merge conflict cedb690 [Ajay Saxena] merge conflict in whats new file 7af95da [Ajay Saxena] merging lastest changes ac8ae60 [Ajay Saxena] skip clipboard test if clipboard primitives are absent b03ed56 [Ajay Saxena] changed whatsnew file c0aafd7 [Ajay Saxena] Merge branch 'test_branch' 9946fb7 [Ajay Saxena] Merge branch 'master' of https://github.com/pandas-dev/pandas into test_branch ed1375f [Ajay Saxena] Merge branch 'test_branch' 0665fd4 [Ajay Saxena] fixed linting and test case as per code review d202fd0 [Ajay Saxena] added test for valid encoding, modified setup.py so that pandas/util/clipboard can be found dd57ae3 [Ajay Saxena] code review changes and read clipboard invalid encoding test 71d58d0 [Ajay Saxena] testing encoding in kwargs to to_clipboard and test case for the same 02f87b0 [Ajay Saxena] removed duplicate files 825bbe2 [Ajay Saxena] all files related to pyperclip are under pandas.util.clipboard c5a87d8 [Ajay Saxena] Merge branch 'test_branch' of https://github.com/aileronajay/pandas into test_branch f708c2e [Ajay Saxena] Merge branch 'master' of https://github.com/aileronajay/pandas d565b1f [Ajay Saxena] updated pyperclip to the latest version 14d94a0 [Ajay Saxena] changed the pandas util clipboard file to return unicode if the python version is 2, else str 66d8ebf [Ajay Saxena] removed the disabled tag for clipboard test so that we can check if they pass after this change edb8553 [Ajay Saxena] refactored the new unicode test to be in sync with the rest of the file c83d000 [Ajay Saxena] added test case for unicode round trip fb922d6 [Ajay Saxena] changes for GH 13747 --- doc/source/whatsnew/v0.19.2.txt | 10 ++ pandas/io/clipboard.py | 24 ++- pandas/io/tests/test_clipboard.py | 35 +++- pandas/util/clipboard.py | 266 ---------------------------- pandas/util/clipboard/__init__.py | 110 ++++++++++++ pandas/util/clipboard/clipboards.py | 136 ++++++++++++++ pandas/util/clipboard/exceptions.py | 12 ++ pandas/util/clipboard/windows.py | 152 ++++++++++++++++ setup.py | 3 +- 9 files changed, 469 insertions(+), 279 deletions(-) delete mode 100644 pandas/util/clipboard.py create mode 100644 pandas/util/clipboard/__init__.py create mode 100644 pandas/util/clipboard/clipboards.py create mode 100644 pandas/util/clipboard/exceptions.py create mode 100644 pandas/util/clipboard/windows.py diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 4e2c6e2faeaa5..ecbd6e9b3b288 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -53,4 +53,14 @@ Bug Fixes + + +- Bug in clipboard functions on linux with python2 with unicode and separators (:issue:`13747`) +- Bug in clipboard functions on Windows 10 and python 3 (:issue:`14362`, :issue:`12807`) +- Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`) + + + + + - 
Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index 6f6f1366a6732..3c7ac528d83fd 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -1,6 +1,6 @@ """ io on the clipboard """ from pandas import compat, get_option, option_context, DataFrame -from pandas.compat import StringIO +from pandas.compat import StringIO, PY2 def read_clipboard(sep='\s+', **kwargs): # pragma: no cover @@ -18,6 +18,14 @@ def read_clipboard(sep='\s+', **kwargs): # pragma: no cover ------- parsed : DataFrame """ + encoding = kwargs.pop('encoding', 'utf-8') + + # only utf-8 is valid for passed value because that's what clipboard + # supports + if encoding is not None and encoding.lower().replace('-', '') != 'utf8': + raise NotImplementedError( + 'reading from clipboard only supports utf-8 encoding') + from pandas.util.clipboard import clipboard_get from pandas.io.parsers import read_table text = clipboard_get() @@ -78,6 +86,12 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover - Windows: - OS X: """ + encoding = kwargs.pop('encoding', 'utf-8') + + # testing if an invalid encoding is passed to clipboard + if encoding is not None and encoding.lower().replace('-', '') != 'utf8': + raise ValueError('clipboard only supports utf-8 encoding') + from pandas.util.clipboard import clipboard_set if excel is None: excel = True @@ -87,8 +101,12 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover if sep is None: sep = '\t' buf = StringIO() - obj.to_csv(buf, sep=sep, **kwargs) - clipboard_set(buf.getvalue()) + # clipboard_set (pyperclip) expects unicode + obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs) + text = buf.getvalue() + if PY2: + text = text.decode('utf-8') + clipboard_set(text) return except: pass diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index af17945bbcf95..93d14077aeacf 100644 --- a/pandas/io/tests/test_clipboard.py +++ b/pandas/io/tests/test_clipboard.py @@ -9,16 +9,16 @@ from pandas import read_clipboard from pandas import get_option from pandas.util import testing as tm -from pandas.util.testing import makeCustomDataframe as mkdf, disabled +from pandas.util.testing import makeCustomDataframe as mkdf +from pandas.util.clipboard.exceptions import PyperclipException try: - import pandas.util.clipboard # noqa -except OSError: - raise nose.SkipTest("no clipboard found") + DataFrame({'A': [1, 2]}).to_clipboard() +except PyperclipException: + raise nose.SkipTest("clipboard primitives not installed") -@disabled class TestClipboard(tm.TestCase): @classmethod @@ -52,6 +52,9 @@ def setUpClass(cls): # Test for non-ascii text: GH9263 cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(), 'es': 'en español'.split()}) + # unicode round trip test for GH 13747, GH 12529 + cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], + 'b': ['øπ∆˚¬', 'œ∑´®']}) cls.data_types = list(cls.data.keys()) @classmethod @@ -59,13 +62,14 @@ def tearDownClass(cls): super(TestClipboard, cls).tearDownClass() del cls.data_types, cls.data - def check_round_trip_frame(self, data_type, excel=None, sep=None): + def check_round_trip_frame(self, data_type, excel=None, sep=None, + encoding=None): data = self.data[data_type] - data.to_clipboard(excel=excel, sep=sep) + data.to_clipboard(excel=excel, sep=sep, encoding=encoding) if sep is not None: - result = read_clipboard(sep=sep, index_col=0) + result = 
read_clipboard(sep=sep, index_col=0, encoding=encoding) else: - result = read_clipboard() + result = read_clipboard(encoding=encoding) tm.assert_frame_equal(data, result, check_dtype=False) def test_round_trip_frame_sep(self): @@ -115,3 +119,16 @@ def test_read_clipboard_infer_excel(self): exp = pd.read_clipboard() tm.assert_frame_equal(res, exp) + + def test_invalid_encoding(self): + # test case for testing invalid encoding + data = self.data['string'] + with tm.assertRaises(ValueError): + data.to_clipboard(encoding='ascii') + with tm.assertRaises(NotImplementedError): + pd.read_clipboard(encoding='ascii') + + def test_round_trip_valid_encodings(self): + for enc in ['UTF-8', 'utf-8', 'utf8']: + for dt in self.data_types: + self.check_round_trip_frame(dt, encoding=enc) diff --git a/pandas/util/clipboard.py b/pandas/util/clipboard.py deleted file mode 100644 index 02da0d5b8159f..0000000000000 --- a/pandas/util/clipboard.py +++ /dev/null @@ -1,266 +0,0 @@ -# Pyperclip v1.5.15 -# A cross-platform clipboard module for Python. -# By Al Sweigart al@inventwithpython.com - -# Usage: -# import pyperclip -# pyperclip.copy('The text to be copied to the clipboard.') -# spam = pyperclip.paste() - -# On Windows, no additional modules are needed. -# On Mac, this module makes use of the pbcopy and pbpaste commands, which -# should come with the os. -# On Linux, this module makes use of the xclip or xsel commands, which should -# come with the os. Otherwise run "sudo apt-get install xclip" or -# "sudo apt-get install xsel" -# Otherwise on Linux, you will need the gtk or PyQt4 modules installed. -# The gtk module is not available for Python 3, and this module does not work -# with PyGObject yet. - - -# Copyright (c) 2015, Albert Sweigart -# All rights reserved. -# -# BSD-style license: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the pyperclip nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# flake8: noqa - -import platform -import os -from subprocess import call, Popen, PIPE - -PY2 = '2' == platform.python_version_tuple()[0] -text_type = unicode if PY2 else str - - -class NoClipboardProgramError(OSError): - pass - - -def _pasteWindows(): - CF_UNICODETEXT = 13 - d = ctypes.windll - d.user32.OpenClipboard(0) - handle = d.user32.GetClipboardData(CF_UNICODETEXT) - data = ctypes.c_wchar_p(handle).value - d.user32.CloseClipboard() - return data - - -def _copyWindows(text): - GMEM_DDESHARE = 0x2000 - CF_UNICODETEXT = 13 - d = ctypes.windll # cdll expects 4 more bytes in user32.OpenClipboard(0) - if not isinstance(text, text_type): - text = text.decode('mbcs') - - d.user32.OpenClipboard(0) - - d.user32.EmptyClipboard() - hCd = d.kernel32.GlobalAlloc(GMEM_DDESHARE, - len(text.encode('utf-16-le')) + 2) - pchData = d.kernel32.GlobalLock(hCd) - ctypes.cdll.msvcrt.wcscpy(ctypes.c_wchar_p(pchData), text) - d.kernel32.GlobalUnlock(hCd) - d.user32.SetClipboardData(CF_UNICODETEXT, hCd) - d.user32.CloseClipboard() - - -def _pasteCygwin(): - CF_UNICODETEXT = 13 - d = ctypes.cdll - d.user32.OpenClipboard(0) - handle = d.user32.GetClipboardData(CF_UNICODETEXT) - data = ctypes.c_wchar_p(handle).value - d.user32.CloseClipboard() - return data - - -def _copyCygwin(text): - GMEM_DDESHARE = 0x2000 - CF_UNICODETEXT = 13 - d = ctypes.cdll - if not isinstance(text, text_type): - text = text.decode('mbcs') - d.user32.OpenClipboard(0) - d.user32.EmptyClipboard() - hCd = d.kernel32.GlobalAlloc(GMEM_DDESHARE, - len(text.encode('utf-16-le')) + 2) - pchData = d.kernel32.GlobalLock(hCd) - ctypes.cdll.msvcrt.wcscpy(ctypes.c_wchar_p(pchData), text) - d.kernel32.GlobalUnlock(hCd) - d.user32.SetClipboardData(CF_UNICODETEXT, hCd) - d.user32.CloseClipboard() - - -def _copyOSX(text): - p = Popen(['pbcopy', 'w'], stdin=PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) - - -def _pasteOSX(): - p = Popen(['pbpaste', 'r'], stdout=PIPE, close_fds=True) - stdout, stderr = p.communicate() - return stdout.decode('utf-8') - - -def _pasteGtk(): - return gtk.Clipboard().wait_for_text() - - -def _copyGtk(text): - global cb - cb = gtk.Clipboard() - cb.set_text(text) - cb.store() - - -def _pasteQt(): - return str(cb.text()) - - -def _copyQt(text): - cb.setText(text) - - -def _copyXclip(text): - p = Popen(['xclip', '-selection', 'c'], stdin=PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) - - -def _pasteXclip(): - p = Popen(['xclip', '-selection', 'c', '-o'], stdout=PIPE, close_fds=True) - stdout, stderr = p.communicate() - return stdout.decode('utf-8') - - -def _copyXsel(text): - p = Popen(['xsel', '-b', '-i'], stdin=PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) - - -def _pasteXsel(): - p = Popen(['xsel', '-b', '-o'], stdout=PIPE, close_fds=True) - stdout, stderr = p.communicate() - return stdout.decode('utf-8') - - -def _copyKlipper(text): - p = Popen(['qdbus', 'org.kde.klipper', '/klipper', - 'setClipboardContents', text.encode('utf-8')], - stdin=PIPE, close_fds=True) - p.communicate(input=None) - - -def _pasteKlipper(): - p = Popen(['qdbus', 'org.kde.klipper', '/klipper', - 'getClipboardContents'], stdout=PIPE, close_fds=True) - stdout, stderr = p.communicate() - return stdout.decode('utf-8') - - -# Determine the OS/platform and set the copy() and paste() functions -# accordingly. 
-if 'cygwin' in platform.system().lower(): - _functions = 'Cygwin' # for debugging - import ctypes - paste = _pasteCygwin - copy = _copyCygwin -elif os.name == 'nt' or platform.system() == 'Windows': - _functions = 'Windows' # for debugging - import ctypes - paste = _pasteWindows - copy = _copyWindows -elif os.name == 'mac' or platform.system() == 'Darwin': - _functions = 'OS X pbcopy/pbpaste' # for debugging - paste = _pasteOSX - copy = _copyOSX -elif os.name == 'posix' or platform.system() == 'Linux': - # Determine which command/module is installed, if any. - xclipExists = call(['which', 'xclip'], - stdout=PIPE, stderr=PIPE) == 0 - - xselExists = call(['which', 'xsel'], - stdout=PIPE, stderr=PIPE) == 0 - - xklipperExists = ( - call(['which', 'klipper'], stdout=PIPE, stderr=PIPE) == 0 and - call(['which', 'qdbus'], stdout=PIPE, stderr=PIPE) == 0 - ) - - gtkInstalled = False - try: - # Check it gtk is installed. - import gtk - gtkInstalled = True - except ImportError: - pass - - if not gtkInstalled: - # Check for either PyQt4 or PySide - qtBindingInstalled = True - try: - from PyQt4 import QtGui - except ImportError: - try: - from PySide import QtGui - except ImportError: - qtBindingInstalled = False - - # Set one of the copy & paste functions. - if xclipExists: - _functions = 'xclip command' # for debugging - paste = _pasteXclip - copy = _copyXclip - elif xklipperExists: - _functions = '(KDE Klipper) - qdbus (external)' # for debugging - paste = _pasteKlipper - copy = _copyKlipper - elif gtkInstalled: - _functions = 'gtk module' # for debugging - paste = _pasteGtk - copy = _copyGtk - elif qtBindingInstalled: - _functions = 'PyQt4 module' # for debugging - app = QtGui.QApplication([]) - cb = QtGui.QApplication.clipboard() - paste = _pasteQt - copy = _copyQt - elif xselExists: - # TODO: xsel doesn't seem to work on Raspberry Pi (my test Linux - # environment). Putting this as the last method tried. - _functions = 'xsel command' # for debugging - paste = _pasteXsel - copy = _copyXsel - else: - raise NoClipboardProgramError('Pyperclip requires the gtk, PyQt4, or ' - 'PySide module installed, or either the ' - 'xclip or xsel command.') -else: - raise RuntimeError('pyperclip does not support your system.') - -# pandas aliases -clipboard_get = paste -clipboard_set = copy diff --git a/pandas/util/clipboard/__init__.py b/pandas/util/clipboard/__init__.py new file mode 100644 index 0000000000000..358c9b5f8035a --- /dev/null +++ b/pandas/util/clipboard/__init__.py @@ -0,0 +1,110 @@ +""" +Pyperclip + +A cross-platform clipboard module for Python. (only handles plain text for now) +By Al Sweigart al@inventwithpython.com +BSD License + +Usage: + import pyperclip + pyperclip.copy('The text to be copied to the clipboard.') + spam = pyperclip.paste() + + if not pyperclip.copy: + print("Copy functionality unavailable!") + +On Windows, no additional modules are needed. +On Mac, the module uses pbcopy and pbpaste, which should come with the os. +On Linux, install xclip or xsel via package manager. For example, in Debian: +sudo apt-get install xclip + +Otherwise on Linux, you will need the gtk or PyQt4 modules installed. + +gtk and PyQt4 modules are not available for Python 3, +and this module does not work with PyGObject yet. 
+""" +__version__ = '1.5.27' + +# flake8: noqa + +import platform +import os +import subprocess +from .clipboards import (init_osx_clipboard, + init_gtk_clipboard, init_qt_clipboard, + init_xclip_clipboard, init_xsel_clipboard, + init_klipper_clipboard, init_no_clipboard) +from .windows import init_windows_clipboard + +# `import PyQt4` sys.exit()s if DISPLAY is not in the environment. +# Thus, we need to detect the presence of $DISPLAY manually +# and not load PyQt4 if it is absent. +HAS_DISPLAY = os.getenv("DISPLAY", False) +CHECK_CMD = "where" if platform.system() == "Windows" else "which" + + +def _executable_exists(name): + return subprocess.call([CHECK_CMD, name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 + + +def determine_clipboard(): + # Determine the OS/platform and set + # the copy() and paste() functions accordingly. + if 'cygwin' in platform.system().lower(): + # FIXME: pyperclip currently does not support Cygwin, + # see https://github.com/asweigart/pyperclip/issues/55 + pass + elif os.name == 'nt' or platform.system() == 'Windows': + return init_windows_clipboard() + if os.name == 'mac' or platform.system() == 'Darwin': + return init_osx_clipboard() + if HAS_DISPLAY: + # Determine which command/module is installed, if any. + try: + import gtk # check if gtk is installed + except ImportError: + pass + else: + return init_gtk_clipboard() + + try: + import PyQt4 # check if PyQt4 is installed + except ImportError: + pass + else: + return init_qt_clipboard() + + if _executable_exists("xclip"): + return init_xclip_clipboard() + if _executable_exists("xsel"): + return init_xsel_clipboard() + if _executable_exists("klipper") and _executable_exists("qdbus"): + return init_klipper_clipboard() + + return init_no_clipboard() + + +def set_clipboard(clipboard): + global copy, paste + + clipboard_types = {'osx': init_osx_clipboard, + 'gtk': init_gtk_clipboard, + 'qt': init_qt_clipboard, + 'xclip': init_xclip_clipboard, + 'xsel': init_xsel_clipboard, + 'klipper': init_klipper_clipboard, + 'windows': init_windows_clipboard, + 'no': init_no_clipboard} + + copy, paste = clipboard_types[clipboard]() + + +copy, paste = determine_clipboard() + +__all__ = ["copy", "paste"] + + +# pandas aliases +clipboard_get = paste +clipboard_set = copy \ No newline at end of file diff --git a/pandas/util/clipboard/clipboards.py b/pandas/util/clipboard/clipboards.py new file mode 100644 index 0000000000000..182a685f956e6 --- /dev/null +++ b/pandas/util/clipboard/clipboards.py @@ -0,0 +1,136 @@ +# flake8: noqa + +import sys +import subprocess +from .exceptions import PyperclipException + +EXCEPT_MSG = """ + Pyperclip could not find a copy/paste mechanism for your system. + For more information, please visit https://pyperclip.readthedocs.org """ +PY2 = sys.version_info[0] == 2 +text_type = unicode if PY2 else str + + +def init_osx_clipboard(): + def copy_osx(text): + p = subprocess.Popen(['pbcopy', 'w'], + stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode('utf-8')) + + def paste_osx(): + p = subprocess.Popen(['pbpaste', 'r'], + stdout=subprocess.PIPE, close_fds=True) + stdout, stderr = p.communicate() + return stdout.decode('utf-8') + + return copy_osx, paste_osx + + +def init_gtk_clipboard(): + import gtk + + def copy_gtk(text): + global cb + cb = gtk.Clipboard() + cb.set_text(text) + cb.store() + + def paste_gtk(): + clipboardContents = gtk.Clipboard().wait_for_text() + # for python 2, returns None if the clipboard is blank. 
+ if clipboardContents is None: + return '' + else: + return clipboardContents + + return copy_gtk, paste_gtk + + +def init_qt_clipboard(): + # $DISPLAY should exist + from PyQt4.QtGui import QApplication + + app = QApplication([]) + + def copy_qt(text): + cb = app.clipboard() + cb.setText(text) + + def paste_qt(): + cb = app.clipboard() + return text_type(cb.text()) + + return copy_qt, paste_qt + + +def init_xclip_clipboard(): + def copy_xclip(text): + p = subprocess.Popen(['xclip', '-selection', 'c'], + stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode('utf-8')) + + def paste_xclip(): + p = subprocess.Popen(['xclip', '-selection', 'c', '-o'], + stdout=subprocess.PIPE, close_fds=True) + stdout, stderr = p.communicate() + return stdout.decode('utf-8') + + return copy_xclip, paste_xclip + + +def init_xsel_clipboard(): + def copy_xsel(text): + p = subprocess.Popen(['xsel', '-b', '-i'], + stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode('utf-8')) + + def paste_xsel(): + p = subprocess.Popen(['xsel', '-b', '-o'], + stdout=subprocess.PIPE, close_fds=True) + stdout, stderr = p.communicate() + return stdout.decode('utf-8') + + return copy_xsel, paste_xsel + + +def init_klipper_clipboard(): + def copy_klipper(text): + p = subprocess.Popen( + ['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents', + text.encode('utf-8')], + stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=None) + + def paste_klipper(): + p = subprocess.Popen( + ['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'], + stdout=subprocess.PIPE, close_fds=True) + stdout, stderr = p.communicate() + + # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874 + # TODO: https://github.com/asweigart/pyperclip/issues/43 + clipboardContents = stdout.decode('utf-8') + # even if blank, Klipper will append a newline at the end + assert len(clipboardContents) > 0 + # make sure that newline is there + assert clipboardContents.endswith('\n') + if clipboardContents.endswith('\n'): + clipboardContents = clipboardContents[:-1] + return clipboardContents + + return copy_klipper, paste_klipper + + +def init_no_clipboard(): + class ClipboardUnavailable(object): + def __call__(self, *args, **kwargs): + raise PyperclipException(EXCEPT_MSG) + + if PY2: + def __nonzero__(self): + return False + else: + def __bool__(self): + return False + + return ClipboardUnavailable(), ClipboardUnavailable() diff --git a/pandas/util/clipboard/exceptions.py b/pandas/util/clipboard/exceptions.py new file mode 100644 index 0000000000000..615335f3a58da --- /dev/null +++ b/pandas/util/clipboard/exceptions.py @@ -0,0 +1,12 @@ +# flake8: noqa +import ctypes + + +class PyperclipException(RuntimeError): + pass + + +class PyperclipWindowsException(PyperclipException): + def __init__(self, message): + message += " (%s)" % ctypes.WinError() + super(PyperclipWindowsException, self).__init__(message) diff --git a/pandas/util/clipboard/windows.py b/pandas/util/clipboard/windows.py new file mode 100644 index 0000000000000..956d5b9d34025 --- /dev/null +++ b/pandas/util/clipboard/windows.py @@ -0,0 +1,152 @@ +# flake8: noqa +""" +This module implements clipboard handling on Windows using ctypes. 
+""" +import time +import contextlib +import ctypes +from ctypes import c_size_t, sizeof, c_wchar_p, get_errno, c_wchar +from .exceptions import PyperclipWindowsException + + +class CheckedCall(object): + def __init__(self, f): + super(CheckedCall, self).__setattr__("f", f) + + def __call__(self, *args): + ret = self.f(*args) + if not ret and get_errno(): + raise PyperclipWindowsException("Error calling " + self.f.__name__) + return ret + + def __setattr__(self, key, value): + setattr(self.f, key, value) + + +def init_windows_clipboard(): + from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND, + HINSTANCE, HMENU, BOOL, UINT, HANDLE) + + windll = ctypes.windll + + safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA) + safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT, + INT, INT, HWND, HMENU, HINSTANCE, LPVOID] + safeCreateWindowExA.restype = HWND + + safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow) + safeDestroyWindow.argtypes = [HWND] + safeDestroyWindow.restype = BOOL + + OpenClipboard = windll.user32.OpenClipboard + OpenClipboard.argtypes = [HWND] + OpenClipboard.restype = BOOL + + safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard) + safeCloseClipboard.argtypes = [] + safeCloseClipboard.restype = BOOL + + safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard) + safeEmptyClipboard.argtypes = [] + safeEmptyClipboard.restype = BOOL + + safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData) + safeGetClipboardData.argtypes = [UINT] + safeGetClipboardData.restype = HANDLE + + safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData) + safeSetClipboardData.argtypes = [UINT, HANDLE] + safeSetClipboardData.restype = HANDLE + + safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc) + safeGlobalAlloc.argtypes = [UINT, c_size_t] + safeGlobalAlloc.restype = HGLOBAL + + safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock) + safeGlobalLock.argtypes = [HGLOBAL] + safeGlobalLock.restype = LPVOID + + safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock) + safeGlobalUnlock.argtypes = [HGLOBAL] + safeGlobalUnlock.restype = BOOL + + GMEM_MOVEABLE = 0x0002 + CF_UNICODETEXT = 13 + + @contextlib.contextmanager + def window(): + """ + Context that provides a valid Windows hwnd. + """ + # we really just need the hwnd, so setting "STATIC" + # as predefined lpClass is just fine. + hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0, + None, None, None, None) + try: + yield hwnd + finally: + safeDestroyWindow(hwnd) + + @contextlib.contextmanager + def clipboard(hwnd): + """ + Context manager that opens the clipboard and prevents + other applications from modifying the clipboard content. + """ + # We may not get the clipboard handle immediately because + # some other application is accessing it (?) + # We try for at least 500ms to get the clipboard. + t = time.time() + 0.5 + success = False + while time.time() < t: + success = OpenClipboard(hwnd) + if success: + break + time.sleep(0.01) + if not success: + raise PyperclipWindowsException("Error calling OpenClipboard") + + try: + yield + finally: + safeCloseClipboard() + + def copy_windows(text): + # This function is heavily based on + # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard + with window() as hwnd: + # http://msdn.com/ms649048 + # If an application calls OpenClipboard with hwnd set to NULL, + # EmptyClipboard sets the clipboard owner to NULL; + # this causes SetClipboardData to fail. 
+ # => We need a valid hwnd to copy something. + with clipboard(hwnd): + safeEmptyClipboard() + + if text: + # http://msdn.com/ms649051 + # If the hMem parameter identifies a memory object, + # the object must have been allocated using the + # function with the GMEM_MOVEABLE flag. + count = len(text) + 1 + handle = safeGlobalAlloc(GMEM_MOVEABLE, + count * sizeof(c_wchar)) + locked_handle = safeGlobalLock(handle) + + ctypes.memmove(c_wchar_p(locked_handle), c_wchar_p(text), count * sizeof(c_wchar)) + + safeGlobalUnlock(handle) + safeSetClipboardData(CF_UNICODETEXT, handle) + + def paste_windows(): + with clipboard(None): + handle = safeGetClipboardData(CF_UNICODETEXT) + if not handle: + # GetClipboardData may return NULL with errno == NO_ERROR + # if the clipboard is empty. + # (Also, it may return a handle to an empty buffer, + # but technically that's not empty) + return "" + return c_wchar_p(handle).value + + return copy_windows, paste_windows diff --git a/setup.py b/setup.py index a982ccb8e9463..2dd3fec150781 100755 --- a/setup.py +++ b/setup.py @@ -644,7 +644,8 @@ def pxd(name): 'pandas.io.tests.parser', 'pandas.io.tests.sas', 'pandas.stats.tests', - 'pandas.msgpack' + 'pandas.msgpack', + 'pandas.util.clipboard' ], package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5', 'tests/data/legacy_pickle/*/*.pickle', From f26b049786624ed983f2718687c23e3f1adbb670 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Thu, 18 Aug 2016 10:19:17 +0200 Subject: [PATCH 080/183] COMPAT: Require a problem description in issues Currently some issues are just code examples without a description **why** a change should be done. This leads to problems when (years later) the current behaviour is questioned and no one can remember why it was changed. closes #14032 --- .github/ISSUE_TEMPLATE.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c7d731249f9cf..1f614b54b1f71 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,9 +1,12 @@ -#### A small, complete example of the issue +#### Code Sample, a copy-pastable example if possible ```python # Your code here ``` +#### Problem description + +[this should explain **why** the current behaviour is a problem and why the expected output is a better solution.] #### Expected Output From f60964092d3348e432764f53e01b06ff9ef601e9 Mon Sep 17 00:00:00 2001 From: Joe Jevnik Date: Tue, 22 Nov 2016 06:13:25 -0500 Subject: [PATCH 081/183] BUG: Fix move_into_mutable_buffer for python 3.6. In python 3.6, the CALL_FUNCTION handling was updated. One change is that when calling a C function from a python function python now counts the reference owned by the argument tuple. This means that move was always seeing objects with two references instead of the expected one. Python 3.6 also removed a copy in the argument tuple when *unpacking functions. This means that if a user does: tuple = (create_string(),) move_into_mutable_buffer(*tuple) where create_string() creates a string object with one reference then we will fail to raise a BadMove even though the user could later retrieve that string with tuple[0]. There is no way to detect this case so this patch adds a warning to the docstring advising against star unpacking. xref: #14679 I played around with removing the extra reference that was added in 3.6 but it looks like playing with borrowed refs everywhere will be a bit tricky. This change should clear things up for 3.6 while continuing to work for older versions. 
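As a minimal sketch of the supported calling convention (illustrative only, not part of the committed change; it uses just the public names imported by the tests in this patch, move_into_mutable_buffer and BadMove from pandas.util._move, plus a hypothetical make_bytes helper): hand over an rvalue whose only reference lives on the call stack, and expect a refusal for any object that is still reachable through a name.

    from pandas.util._move import move_into_mutable_buffer, BadMove

    def make_bytes():
        # temporary bytes object; the caller's stack holds the only reference
        return b'x' * 2048

    buf = move_into_mutable_buffer(make_bytes())   # accepted: single reference

    s = b'y' * 2048            # the name `s` keeps the object reachable
    try:
        move_into_mutable_buffer(s)
    except BadMove:
        pass                   # refused: more than one reference

The star-unpacking case described above is the one hole this check cannot close, which is why the fix adds a docstring warning rather than a runtime error.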
In 3.6 you __could__ get a shared mutable string from this but you need to try pretty hard for it. Author: Joe Jevnik Closes #14695 from llllllllll/move-3.6-compat and squashes the following commits: d1e8b1b [Joe Jevnik] BUG: Fix move_into_mutable_buffer for python 3.6. --- pandas/tests/test_util.py | 10 ++- pandas/util/move.c | 170 ++++++++++++++++++-------------------- 2 files changed, 89 insertions(+), 91 deletions(-) diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index ee33e24c7f6c4..cb12048676d26 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -5,7 +5,7 @@ import sys import unittest from uuid import uuid4 -from pandas.util._move import move_into_mutable_buffer, BadMove +from pandas.util._move import move_into_mutable_buffer, BadMove, stolenbuf from pandas.util.decorators import deprecate_kwarg from pandas.util.validators import (validate_args, validate_kwargs, validate_args_and_kwargs) @@ -299,6 +299,14 @@ def test_validation(self): class TestMove(tm.TestCase): + def test_cannot_create_instance_of_stolenbuffer(self): + """Stolen buffers need to be created through the smart constructor + ``move_into_mutable_buffer`` which has a bunch of checks in it. + """ + msg = "cannot create 'pandas.util._move.stolenbuf' instances" + with tm.assertRaisesRegexp(TypeError, msg): + stolenbuf() + def test_more_than_one_ref(self): """Test case for when we try to use ``move_into_mutable_buffer`` when the object being moved has other references. diff --git a/pandas/util/move.c b/pandas/util/move.c index fb918c302b100..9a8af5bbfbdf6 100644 --- a/pandas/util/move.c +++ b/pandas/util/move.c @@ -88,54 +88,37 @@ PyBufferProcs stolenbuf_as_buffer = { #endif /* COMPILING_IN_PY2 */ -static PyObject * -stolenbuf_new(PyObject *self, PyObject *args, PyObject *kwargs) -{ - stolenbufobject *ret; - PyObject *bytes_rvalue; - - if (kwargs && PyDict_Size(kwargs)) { - PyErr_SetString(PyExc_TypeError, - "stolenbuf does not accept keyword arguments"); - return NULL; - } - - if (PyTuple_GET_SIZE(args) != 1) { - PyErr_SetString(PyExc_TypeError, - "stolenbuf requires exactly 1 positional argument"); - return NULL; - - } - - /* pull out the single, positional argument */ - bytes_rvalue = PyTuple_GET_ITEM(args, 0); - - if (!PyString_CheckExact(bytes_rvalue)) { - PyErr_SetString(PyExc_TypeError, - "stolenbuf can only steal from bytes objects"); - return NULL; - } - - if (Py_REFCNT(bytes_rvalue) != 1 || PyString_CHECK_INTERNED(bytes_rvalue)) { - /* there is a reference other than the caller's stack or the string is - interned */ - PyErr_SetObject(badmove, bytes_rvalue); - return NULL; - } - - if (!(ret = PyObject_New(stolenbufobject, &stolenbuf_type))) { - return NULL; - } +PyDoc_STRVAR(stolenbuf_doc, + "A buffer that is wrapping a stolen bytes object's buffer."); - /* store the original bytes object in a field that is not - exposed to python */ - Py_INCREF(bytes_rvalue); - ret->invalid_bytes = bytes_rvalue; - return (PyObject*) ret; -} +PyTypeObject stolenbuf_type = { + PyVarObject_HEAD_INIT(NULL, 0) + "pandas.util._move.stolenbuf", /* tp_name */ + sizeof(stolenbufobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor) stolenbuf_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + &stolenbuf_as_buffer, /* tp_as_buffer */ + 
Py_TPFLAGS_DEFAULT | + Py_TPFLAGS_HAVE_NEWBUFFER | + Py_TPFLAGS_HAVE_GETCHARBUFFER, /* tp_flags */ + stolenbuf_doc, /* tp_doc */ +}; PyDoc_STRVAR( - stolenbuf_doc, + move_into_mutable_buffer_doc, "Moves a bytes object that is about to be destroyed into a mutable buffer\n" "without copying the data.\n" "\n" @@ -159,49 +142,55 @@ PyDoc_STRVAR( "\n" "Notes\n" "-----\n" - "If you want to use this function you are probably wrong.\n"); + "If you want to use this function you are probably wrong.\n" + "\n" + "Warning: Do not call this function through *unpacking. This can\n" + "potentially trick the reference checks which may allow you to get a\n" + "mutable reference to a shared string!\n" + "\n"); + +/* This is implemented as a standalone function instead of the ``tp_new`` of + ``stolenbuf`` because we need to create a function using the METH_O flag + to support Python 3.6. In python 3.6, PyCFunction calls from python code now + count the reference owned by the argument tuple. This would cause the object + to have 2 references if used with a direct call like: ``stolenbuf(a)``; + however, if called through *unpacking like ``stolenbuf(*(a,))`` it would + only have the one reference (the tuple). */ +static PyObject* +move_into_mutable_buffer(PyObject *self, PyObject *bytes_rvalue) +{ + stolenbufobject *ret; -PyTypeObject stolenbuf_type = { - PyVarObject_HEAD_INIT(NULL, 0) - "pandas.util._move.stolenbuf", /* tp_name */ - sizeof(stolenbufobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor) stolenbuf_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - &stolenbuf_as_buffer, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT | - Py_TPFLAGS_HAVE_NEWBUFFER | - Py_TPFLAGS_HAVE_GETCHARBUFFER, /* tp_flags */ - stolenbuf_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - 0, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - (newfunc) stolenbuf_new, /* tp_new */ + if (!PyString_CheckExact(bytes_rvalue)) { + PyErr_SetString(PyExc_TypeError, + "stolenbuf can only steal from bytes objects"); + return NULL; + } + + if (Py_REFCNT(bytes_rvalue) != 1 || PyString_CHECK_INTERNED(bytes_rvalue)) { + /* there is a reference other than the caller's stack or the string is + interned */ + PyErr_SetObject(badmove, bytes_rvalue); + return NULL; + } + + if (!(ret = PyObject_New(stolenbufobject, &stolenbuf_type))) { + return NULL; + } + + /* store the original bytes object in a field that is not + exposed to python */ + Py_INCREF(bytes_rvalue); + ret->invalid_bytes = bytes_rvalue; + return (PyObject*) ret; +} + +PyMethodDef methods[] = { + {"move_into_mutable_buffer", + (PyCFunction) move_into_mutable_buffer, + METH_O, + move_into_mutable_buffer_doc}, + {NULL}, }; #define MODULE_NAME "pandas.util._move" @@ -212,6 +201,7 @@ PyModuleDef _move_module = { MODULE_NAME, NULL, -1, + methods, }; #endif /* !COMPILING_IN_PY2 */ @@ -223,7 +213,7 @@ PyDoc_STRVAR( "Parameters\n" "----------\n" "data : any\n" - " The data which was passed to ``_move_into_mutable_buffer``.\n" + " The data which was passed to 
``move_into_mutable_buffer``.\n" "\n" "See Also\n" "--------\n" @@ -254,14 +244,14 @@ init_move(void) #if !COMPILING_IN_PY2 if (!(m = PyModule_Create(&_move_module))) #else - if (!(m = Py_InitModule(MODULE_NAME, NULL))) + if (!(m = Py_InitModule(MODULE_NAME, methods))) #endif /* !COMPILING_IN_PY2 */ { return ERROR_RETURN; } if (PyModule_AddObject(m, - "move_into_mutable_buffer", + "stolenbuf", (PyObject*) &stolenbuf_type)) { Py_DECREF(m); return ERROR_RETURN; From f862b52e752e7f9003ca754179dec7503fccfffa Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Tue, 22 Nov 2016 06:18:56 -0500 Subject: [PATCH 082/183] BUG: Fix parse empty df closes #14515 This commit fixes a bug where `read_csv` failed when given a file with a multiindex header and empty content. Because pandas reads index names as a separate line following the header lines, the reader looks for the line with index names in it. If the content of the dataframe is empty, the reader will choke. This bug surfaced after https://github.com/pandas-dev/pandas/issues/6618 stopped writing an extra line after multiindex columns, which led to a situation where pandas could write CSV's that it couldn't then read. This commit changes that behavior by explicitly checking if the index name row exists, and processing it correctly if it doesn't. Author: Ben Kandel Closes #14596 from bkandel/fix-parse-empty-df and squashes the following commits: 32e3b0a [Ben Kandel] lint e6b1237 [Ben Kandel] lint fedfff8 [Ben Kandel] fix multiindex column parsing 518982d [Ben Kandel] move to 0.19.2 fc23e5c [Ben Kandel] fix errant this_columns 3d9bbdd [Ben Kandel] whatsnew 68eadf3 [Ben Kandel] Modify test. 17e44dd [Ben Kandel] fix python parser too 72adaf2 [Ben Kandel] remove unnecessary test bfe0423 [Ben Kandel] typo 2f64d57 [Ben Kandel] pep8 b8200e4 [Ben Kandel] BUG: read_csv with empty df --- doc/source/whatsnew/v0.19.2.txt | 2 +- pandas/io/parsers.py | 16 +++++++++++++--- pandas/io/tests/parser/common.py | 22 ++++++++++++++++++++++ pandas/parser.pyx | 6 ++++-- pandas/tests/frame/test_to_csv.py | 2 +- 5 files changed, 41 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index ecbd6e9b3b288..1a399dcda1add 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -29,7 +29,7 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) - +- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 092cba093421a..3fe5e5e826ebd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1509,10 +1509,11 @@ def read(self, nrows=None): if self._first_chunk: self._first_chunk = False names = self._maybe_dedup_names(self.orig_names) - index, columns, col_dict = _get_empty_meta( names, self.index_col, self.index_names, dtype=self.kwds.get('dtype')) + columns = self._maybe_make_multi_index_columns( + columns, self.col_names) if self.usecols is not None: columns = self._filter_usecols(columns) @@ -1979,8 +1980,11 @@ def read(self, rows=None): if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) - return _get_empty_meta(names, self.index_col, - self.index_names) + index, columns, col_dict = _get_empty_meta( + names, 
self.index_col, self.index_names) + columns = self._maybe_make_multi_index_columns( + columns, self.col_names) + return index, columns, col_dict # handle new style for names in index count_empty_content_vals = count_empty_vals(content[0]) @@ -2083,6 +2087,12 @@ def _infer_columns(self): # We have an empty file, so check # if columns are provided. That will # serve as the 'line' for parsing + if have_mi_columns and hr > 0: + if clear_buffer: + self._clear_buffer() + columns.append([None] * len(columns[-1])) + return columns, num_original_columns + if not self.names: raise EmptyDataError( "No columns to parse from file") diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 4cb00c48976a4..6eb73876c11dd 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -606,6 +606,28 @@ def test_multi_index_no_level_names(self): expected = self.read_csv(StringIO(data), index_col=[1, 0]) tm.assert_frame_equal(df, expected, check_names=False) + def test_multi_index_blank_df(self): + # GH 14545 + data = """a,b +""" + df = self.read_csv(StringIO(data), header=[0]) + expected = DataFrame(columns=['a', 'b']) + tm.assert_frame_equal(df, expected) + round_trip = self.read_csv(StringIO( + expected.to_csv(index=False)), header=[0]) + tm.assert_frame_equal(round_trip, expected) + + data_multiline = """a,b +c,d +""" + df2 = self.read_csv(StringIO(data_multiline), header=[0, 1]) + cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) + expected2 = DataFrame(columns=cols) + tm.assert_frame_equal(df2, expected2) + round_trip = self.read_csv(StringIO( + expected2.to_csv(index=False)), header=[0, 1]) + tm.assert_frame_equal(round_trip, expected2) + def test_no_unnamed_index(self): data = """ id c0 c1 c2 0 1 0 a b diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 9fb99637731be..6b43dfbabc4a0 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -717,7 +717,9 @@ cdef class TextReader: start = self.parser.line_start[0] # e.g., if header=3 and file only has 2 lines - elif self.parser.lines < hr + 1: + elif (self.parser.lines < hr + 1 + and not isinstance(self.orig_header, list)) or ( + self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): msg = "[%s], len of %d," % ( @@ -940,7 +942,7 @@ cdef class TextReader: raise_parser_error('Error tokenizing data', self.parser) footer = self.skipfooter - if self.parser_start == self.parser.lines: + if self.parser_start >= self.parser.lines: raise StopIteration self._end_clock('Tokenization') diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 4d6a5bb32038d..1eb3454519ce3 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -587,7 +587,7 @@ def _make_frame(names=None): df = _make_frame(True) df.to_csv(path, tupleize_cols=False) - for i in [5, 6, 7]: + for i in [6, 7]: msg = 'len of {i}, but only 5 lines in file'.format(i=i) with assertRaisesRegexp(ParserError, msg): read_csv(path, tupleize_cols=False, From 9f2e45378cbce5532a8edf2484d62a802369634e Mon Sep 17 00:00:00 2001 From: Julian Santander Date: Tue, 22 Nov 2016 06:24:52 -0500 Subject: [PATCH 083/183] BUG: Avoid AmbiguousTimeError on groupby closes #14682 Author: Julian Santander Author: Julian Santander Closes #14683 from j-santander/master and squashes the following commits: d90afaf [Julian Santander] Addressing additional code inspection comments 817ed97 [Julian Santander] Addressing code inspections comments 99a5367 [Julian Santander] Fix unittest error and 
lint warning 940fb22 [Julian Santander] Avoid AmbiguousTimeError on groupby --- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/tseries/index.py | 2 +- pandas/tseries/resample.py | 16 ++++++++++----- pandas/tseries/tests/test_resample.py | 28 ++++++++++++++++++++++++++- 4 files changed, 40 insertions(+), 7 deletions(-) mode change 100644 => 100755 pandas/tseries/resample.py mode change 100644 => 100755 pandas/tseries/tests/test_resample.py diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 1a399dcda1add..5a255d1e62043 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -49,6 +49,7 @@ Bug Fixes - Compat with python 3.6 for some indexing exception types (:issue:`14684`, :issue:`14689`) - Compat with python 3.6 for deprecation warnings in the test suite (:issue:`14681`) - Compat with python 3.6 for Timestamp pickles (:issue:`14689`) +- Bug in resampling a ``DatetimeIndex`` in local TZ, covering a DST change, which would raise ``AmbiguousTimeError`` (:issue:`14682`) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 70e2d2c121773..024306edef2d8 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -439,7 +439,7 @@ def _generate(cls, start, end, periods, name, offset, tz = tz.localize(date.replace(tzinfo=None)).tzinfo if tz is not None and inferred_tz is not None: - if not inferred_tz == tz: + if not tslib.get_timezone(inferred_tz) == tslib.get_timezone(tz): raise AssertionError("Inferred time zone not equal to passed " "time zone") diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py old mode 100644 new mode 100755 index d02c403cb3c66..31781eb3fc131 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1283,9 +1283,18 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): # # See https://github.com/pandas-dev/pandas/issues/8683 + # 14682 - Since we need to drop the TZ information to perform + # the adjustment in the presence of a DST change, + # save TZ Info and the DST state of the first and last parameters + # so that we can accurately rebuild them at the end. 
first_tzinfo = first.tzinfo + last_tzinfo = last.tzinfo + first_dst = bool(first.dst()) + last_dst = bool(last.dst()) + first = first.tz_localize(None) last = last.tz_localize(None) + start_day_nanos = first.normalize().value base_nanos = (base % offset.n) * offset.nanos // offset.n @@ -1320,11 +1329,8 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): else: lresult = last.value + offset.nanos -# return (Timestamp(fresult, tz=first.tz), -# Timestamp(lresult, tz=last.tz)) - - return (Timestamp(fresult).tz_localize(first_tzinfo), - Timestamp(lresult).tz_localize(first_tzinfo)) + return (Timestamp(fresult).tz_localize(first_tzinfo, ambiguous=first_dst), + Timestamp(lresult).tz_localize(last_tzinfo, ambiguous=last_dst)) def asfreq(obj, freq, method=None, how=None, normalize=False): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py old mode 100644 new mode 100755 index 9d3d27f3224b4..b8c060c024867 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1912,7 +1912,33 @@ def test_resample_size(self): right = Series(val, index=ix) assert_series_equal(left, right) - def test_resmaple_dst_anchor(self): + def test_resample_across_dst(self): + # The test resamples a DatetimeIndex with values before and after a + # DST change + # Issue: 14682 + + # The DatetimeIndex we will start with + # (note that DST happens at 03:00+02:00 -> 02:00+01:00) + # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00 + df1 = DataFrame([1477786980, 1477790580], columns=['ts']) + dti1 = DatetimeIndex(pd.to_datetime(df1.ts, unit='s') + .dt.tz_localize('UTC') + .dt.tz_convert('Europe/Madrid')) + + # The expected DatetimeIndex after resampling. + # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00 + df2 = DataFrame([1477785600, 1477789200], columns=['ts']) + dti2 = DatetimeIndex(pd.to_datetime(df2.ts, unit='s') + .dt.tz_localize('UTC') + .dt.tz_convert('Europe/Madrid')) + df = DataFrame([5, 5], index=dti1) + + result = df.resample(rule='H').sum() + expected = DataFrame([5, 5], index=dti2) + + assert_frame_equal(result, expected) + + def test_resample_dst_anchor(self): # 5172 dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern') df = DataFrame([5], index=dti) From 3443de7801188dd07da65eafcc2792acd241f99f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 22 Nov 2016 06:38:00 -0500 Subject: [PATCH 084/183] TST: Test aggregation over arrays (#3788) closes #3788 Author: Matt Roeschke Closes #14675 from mroeschke/fix_3788 and squashes the following commits: 01dce9d [Matt Roeschke] TST: Test aggregation over arrays (#3788) --- pandas/tests/test_groupby.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index dc326aeaa88ac..52d1c5c3681e0 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6813,6 +6813,23 @@ def test_group_shift_with_null_key(self): assert_frame_equal(result, expected) + def test_agg_over_numpy_arrays(self): + # GH 3788 + df = pd.DataFrame([[1, np.array([10, 20, 30])], + [1, np.array([40, 50, 60])], + [2, np.array([20, 30, 40])]], + columns=['category', 'arraydata']) + result = df.groupby('category').agg(sum) + + expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] + expected_index = pd.Index([1, 2], name='category') + expected_column = ['arraydata'] + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_column) + + assert_frame_equal(result, 
expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() From fdb70a96716c07a0940312cabffb61aea5d6d997 Mon Sep 17 00:00:00 2001 From: James Draper Date: Tue, 22 Nov 2016 11:39:15 -0500 Subject: [PATCH 085/183] DOC: update FAQ to note pandas-qt only works for python 2.x (#14713) --- doc/source/faq.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/faq.rst b/doc/source/faq.rst index d23e0ca59254d..3828ee1f9d091 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -111,5 +111,4 @@ Visualizing Data in Qt applications ----------------------------------- There is no support for such visualization in pandas. However, the external -package `pandas-qt `_ does -provide this functionality. +package `pandas-qt `_ provides this functionality for Python 2.x. From 880de3054c7ce3a6c48e295837207787dc73350f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 22 Nov 2016 18:34:07 -0500 Subject: [PATCH 086/183] TST: skip test_transactions in sqlitefallback on py3.6 --- pandas/io/tests/test_sql.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index e9d19bbd8be66..cb08944e8dc57 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -36,7 +36,7 @@ from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat from pandas import date_range, to_datetime, to_timedelta, Timestamp import pandas.compat as compat -from pandas.compat import StringIO, range, lrange, string_types +from pandas.compat import StringIO, range, lrange, string_types, PY36 from pandas.tseries.tools import format as date_format import pandas.io.sql as sql @@ -2001,6 +2001,8 @@ def test_to_sql_save_index(self): self._to_sql_save_index() def test_transactions(self): + if PY36: + raise nose.SkipTest("not working on python > 3.5") self._transaction_test() def _get_sqlite_column_type(self, table, column): From ae72d3b99b2142326dfff7f865f431fc4b1f79d2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 23 Nov 2016 00:31:49 -0800 Subject: [PATCH 087/183] DOC: Disambiguate 'where' in boolean indexing-10min.rst (#12661) (#14708) Expand on boolean indexing example --- doc/source/10min.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 54bcd76855f32..0612e86134cf2 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -282,7 +282,7 @@ Using a single column's values to select data. df[df.A > 0] -A ``where`` operation for getting. +Selecting values from a DataFrame where a boolean condition is met. .. ipython:: python From 4143b323f5f653c3f7168cef013f7b8ceeb5e27d Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Wed, 23 Nov 2016 08:35:09 +0000 Subject: [PATCH 088/183] DOC: fix typo in merge_asof docstring examples (#14718) --- pandas/tools/merge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index d2060185c3246..8d2f92ad58a88 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -371,7 +371,7 @@ def merge_asof(left, right, on=None, By default we are taking the asof of the quotes - >>> pd.asof_merge(trades, quotes, + >>> pd.merge_asof(trades, quotes, ... on='time', ... by='ticker') time ticker price quantity bid ask @@ -383,7 +383,7 @@ def merge_asof(left, right, on=None, We only asof within 2ms betwen the quote time and the trade time - >>> pd.asof_merge(trades, quotes, + >>> pd.merge_asof(trades, quotes, ... 
on='time', ... by='ticker', ... tolerance=pd.Timedelta('2ms')) @@ -398,7 +398,7 @@ def merge_asof(left, right, on=None, and we exclude exact matches on time. However *prior* data will propogate forward - >>> pd.asof_merge(trades, quotes, + >>> pd.merge_asof(trades, quotes, ... on='time', ... by='ticker', ... tolerance=pd.Timedelta('10ms'), From 73aa6957ad2db8b5d5d90c8c4a33ddac452419f1 Mon Sep 17 00:00:00 2001 From: Nolan Nichols Date: Wed, 23 Nov 2016 12:35:31 -0800 Subject: [PATCH 089/183] doc: comverted --> converted (#14722) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 05148c1f7e80a..f704a61042b4f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1346,7 +1346,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, file quoting : optional constant from csv module defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` - then floats are comverted to strings and thus csv.QUOTE_NONNUMERIC + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC will treat them as non-numeric quotechar : string (length 1), default '\"' character used to quote fields From e4413c425e1fafe0b92b3b04dc49efb57d21af0b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 23 Nov 2016 15:39:25 -0500 Subject: [PATCH 090/183] DEPR: Patch to_dense behaviour for sparse. Patches the following for `to_dense`: 1) Fix `SparseArray.to_dense` documentation to refer to `SparseArray` and not `SparseSeries`. 2) Deprecate the fill parameter in `SparseArray.to_dense`, as that parameter was not being respected. 3) Deprecate the sparse_only parameter in `SparseSeries.to_dense`, as that parameter is inconsistent with the `to_dense` API we want, which is no parameters. Closes #14647. Author: gfyoung Closes #14686 from gfyoung/to-dense-patch and squashes the following commits: ad7da32 [gfyoung] BUG: Patch to_dense behaviour for sparse. --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/sparse/array.py | 17 ++++++++++++++++- pandas/sparse/series.py | 17 ++++++++++++++++- pandas/sparse/tests/test_array.py | 5 +++++ pandas/sparse/tests/test_series.py | 5 ++++- 5 files changed, 43 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 581106924c77e..03e0cae6cc83f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -64,6 +64,8 @@ Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ``pd.to_datetime`` and ``pd.to_timedelta`` have dropped the ``coerce`` parameter in favor of ``errors`` (:issue:`13602`) +- ``SparseArray.to_dense()`` has deprecated the ``fill`` parameter, as that parameter was not being respected (:issue:`14647`) +- ``SparseSeries.to_dense()`` has deprecated the ``sparse_only`` parameter (:issue:`14647`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 8420371d05e02..a15def65cad7e 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -5,6 +5,7 @@ # pylint: disable=E1101,E1103,W0231 import numpy as np +import warnings import pandas as pd from pandas.core.base import PandasObject @@ -381,8 +382,22 @@ def get_values(self, fill=None): def to_dense(self, fill=None): """ - Convert SparseSeries to (dense) Series + Convert SparseArray to a NumPy array. + + Parameters + ---------- + fill: float, default None + DEPRECATED: this argument will be removed in a future version + because it is not respected by this function. 
+ + Returns + ------- + arr : NumPy array """ + if fill is not None: + warnings.warn(("The 'fill' parameter has been deprecated and " + "will be removed in a future version."), + FutureWarning, stacklevel=2) return self.values def __iter__(self): diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index ad9168890b8f2..660f76ff1001d 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -528,9 +528,24 @@ def _set_values(self, key, value): def to_dense(self, sparse_only=False): """ - Convert SparseSeries to (dense) Series + Convert SparseSeries to a Series. + + Parameters + ---------- + sparse_only: bool, default False + DEPRECATED: this argument will be removed in a future version. + + If True, return just the non-sparse values, or the dense version + of `self.values` if False. + + Returns + ------- + s : Series """ if sparse_only: + warnings.warn(("The 'sparse_only' parameter has been deprecated " + "and will be removed in a future version."), + FutureWarning, stacklevel=2) int_index = self.sp_index.to_int_index() index = self.index.take(int_index.indices) return Series(self.sp_values, index=index, name=self.name) diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 2b284ac631d3f..1c9b6119cf665 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -453,6 +453,11 @@ def test_to_dense(self): res = SparseArray(vals, fill_value=0).to_dense() tm.assert_numpy_array_equal(res, vals) + # see gh-14647 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + SparseArray(vals).to_dense(fill=2) + def test_getitem(self): def _checkit(i): assert_almost_equal(self.arr[i], self.arr.values[i]) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index de8c63df9c9e6..116596e36b402 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -161,7 +161,10 @@ def test_sparse_to_dense(self): series = self.bseries.to_dense() tm.assert_series_equal(series, Series(arr, name='bseries')) - series = self.bseries.to_dense(sparse_only=True) + # see gh-14647 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + series = self.bseries.to_dense(sparse_only=True) indexer = np.isfinite(arr) exp = Series(arr[indexer], index=index[indexer], name='bseries') From 22d982a8afdef3c438c9c93dfe5299cc5ca07de2 Mon Sep 17 00:00:00 2001 From: Dimitris Spathis Date: Wed, 23 Nov 2016 22:23:32 +0100 Subject: [PATCH 091/183] Update frame.py (#14724) typo "explicitly" --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f704a61042b4f..bf1ff28cd63b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3233,7 +3233,7 @@ def trans(v): # try to be helpful if isinstance(self.columns, MultiIndex): raise ValueError('Cannot sort by column %s in a ' - 'multi-index you need to explicity ' + 'multi-index you need to explicitly ' 'provide all the levels' % str(by)) raise ValueError('Cannot sort by duplicate column %s' % From 75b606abad51762b3faace1bbfa0a8d2241dc297 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 24 Nov 2016 16:18:20 -0500 Subject: [PATCH 092/183] BUG: Respect the dtype parameter for empty CSV (#14717) --- doc/source/whatsnew/v0.19.2.txt | 1 + doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 25 +++++++++----- pandas/io/tests/parser/c_parser_only.py | 46 +++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 8 
deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 5a255d1e62043..49c8330490ed1 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -61,6 +61,7 @@ Bug Fixes - Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`) +- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 03e0cae6cc83f..65b62601c7022 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -83,3 +83,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3fe5e5e826ebd..929b360854d5b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,6 +20,7 @@ is_float, is_scalar) from pandas.core.index import Index, MultiIndex, RangeIndex +from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.common import AbstractMethodError from pandas.core.config import get_option @@ -2791,19 +2792,27 @@ def _clean_index_names(columns, index_col): def _get_empty_meta(columns, index_col, index_names, dtype=None): columns = list(columns) - if dtype is None: - dtype = {} + # Convert `dtype` to a defaultdict of some kind. + # This will enable us to write `dtype[col_name]` + # without worrying about KeyError issues later on. + if not isinstance(dtype, dict): + # if dtype == None, default will be np.object. + default_dtype = dtype or np.object + dtype = defaultdict(lambda: default_dtype) else: - if not isinstance(dtype, dict): - dtype = defaultdict(lambda: dtype) + # Save a copy of the dictionary. + _dtype = dtype.copy() + dtype = defaultdict(lambda: np.object) + # Convert column indexes to column names. 
- dtype = dict((columns[k] if is_integer(k) else k, v) - for k, v in compat.iteritems(dtype)) + for k, v in compat.iteritems(_dtype): + col = columns[k] if is_integer(k) else k + dtype[col] = v if index_col is None or index_col is False: index = Index([]) else: - index = [np.empty(0, dtype=dtype.get(index_name, np.object)) + index = [Series([], dtype=dtype[index_name]) for index_name in index_names] index = MultiIndex.from_arrays(index, names=index_names) index_col.sort() @@ -2811,7 +2820,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): columns.pop(n - i) col_dict = dict((col_name, - np.empty(0, dtype=dtype.get(col_name, np.object))) + Series([], dtype=dtype[col_name])) for col_name in columns) return index, columns, col_dict diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 75b99654dbf89..9cbe88d4032a3 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -561,3 +561,49 @@ def test_internal_null_byte(self): result = self.read_csv(StringIO(data), names=names) tm.assert_frame_equal(result, expected) + + def test_empty_dtype(self): + # see gh-14712 + data = 'a,b' + + expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) + result = self.read_csv(StringIO(data), header=0, dtype=np.float64) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Categorical([]), + 'b': pd.Categorical([])}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='category') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') + result = self.read_csv(StringIO(data), header=0, + dtype='datetime64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), + 'b': pd.Series([], dtype='timedelta64[ns]')}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='timedelta64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={0: np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.int32) + expected['b'] = expected['b'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.int32, 1: np.float64}) + tm.assert_frame_equal(result, expected) From 6ad6e4e1d9251a9fddcbed80bdaad18ed07c66ae Mon Sep 17 00:00:00 2001 From: themrmax Date: Fri, 25 Nov 2016 21:05:30 +1100 Subject: [PATCH 093/183] DOC: Correct uniqueness of index for Series (#14344) --- pandas/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 105e39562f561..44d1703fb9b8a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -103,11 +103,11 @@ class Series(base.IndexOpsMixin, strings.StringAccessorMixin, """ One-dimensional ndarray with axis labels (including time series). - Labels need not be unique but must be any hashable type. The object + Labels need not be unique but must be a hashable type. 
The object supports both integer- and label-based indexing and provides a host of methods for performing operations involving the index. Statistical methods from ndarray have been overridden to automatically exclude - missing data (currently represented as NaN) + missing data (currently represented as NaN). Operations between Series (+, -, /, *, **) align values based on their associated index values-- they need not be the same length. The result @@ -118,8 +118,8 @@ class Series(base.IndexOpsMixin, strings.StringAccessorMixin, data : array-like, dict, or scalar value Contains data stored in Series index : array-like or Index (1d) - Values must be unique and hashable, same length as data. Index - object (or other iterable of same length as data) Will default to + Values must be hashable and have the same length as `data`. + Non-unique index values are allowed. Will default to RangeIndex(len(data)) if not provided. If both a dict and index sequence are used, the index will override the keys found in the dict. From 26a72e115dd8450e420fcc2fa895aab28d899da0 Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Fri, 25 Nov 2016 14:52:11 +0200 Subject: [PATCH 094/183] CLN: move assignment from header into cython (#14731) --- pandas/src/datetime.pxd | 3 --- pandas/src/datetime_helper.h | 5 ----- pandas/tslib.pyx | 3 ++- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/src/datetime.pxd b/pandas/src/datetime.pxd index d3d471a33715d..2267c8282ec14 100644 --- a/pandas/src/datetime.pxd +++ b/pandas/src/datetime.pxd @@ -42,9 +42,6 @@ cdef extern from "datetime.h": object PyDateTime_FromDateAndTime(int year, int month, int day, int hour, int minute, int second, int us) -cdef extern from "datetime_helper.h": - void mangle_nat(object o) - cdef extern from "numpy/ndarrayobject.h": ctypedef int64_t npy_timedelta diff --git a/pandas/src/datetime_helper.h b/pandas/src/datetime_helper.h index d78e91e747854..11399181fa4e7 100644 --- a/pandas/src/datetime_helper.h +++ b/pandas/src/datetime_helper.h @@ -7,11 +7,6 @@ #define PyInt_AS_LONG PyLong_AsLong #endif -void mangle_nat(PyObject *val) { - PyDateTime_GET_MONTH(val) = -1; - PyDateTime_GET_DAY(val) = -1; -} - npy_int64 get_long_attr(PyObject *o, const char *attr) { npy_int64 long_val; PyObject *value = PyObject_GetAttrString(o, attr); diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index acc0e45562cf2..649aa22e5e3ae 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -803,7 +803,8 @@ class NaTType(_NaT): cdef _NaT base base = _NaT.__new__(cls, 1, 1, 1) - mangle_nat(base) + base._day = -1 + base._month = -1 base.value = NPY_NAT return base From ee108164ee7a3746956d01558e0210b22452fd0f Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 25 Nov 2016 08:18:43 -0500 Subject: [PATCH 095/183] MAINT: Ignore .pxi files All `.pxi` files have the warning to not edit directly, so why not have Git ignore any changes made there? 
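For readers unfamiliar with the template mechanism, the following is a minimal
illustrative sketch (not this repository's actual build code) of how a
`.pxi.in` Tempita template expands into the repetitive per-dtype `.pxi`
output; it assumes Cython's bundled Tempita is available, and the dtype list
and function stub are made up for the example:

    # Illustrative only: expand a tiny per-dtype template the way the
    # .pxi.in templates are expanded into .pxi files at build time.
    from Cython import Tempita as tempita

    template = """
    {{for dtype in dtypes}}
    cpdef map_indices_{{dtype}}(ndarray[{{dtype}}_t] index):
        ...
    {{endfor}}
    """

    # Prints one generated function stub per dtype, mirroring the repetitive
    # structure of the checked-in .pxi files being removed here.
    print(tempita.sub(template, dtypes=['float64', 'int64']))
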
In addition, this PR deletes all of the `.pxi` files from the codebase because they are automatically generated by the `.pxi.in` files Author: gfyoung Closes #14723 from gfyoung/gitignore-pxi-ignore and squashes the following commits: fe0c410 [gfyoung] MAINT: Ignore .pxi files --- .gitignore | 1 + pandas/src/algos_common_helper.pxi | 2764 ------------ pandas/src/algos_groupby_helper.pxi | 1375 ------ pandas/src/algos_take_helper.pxi | 4949 --------------------- pandas/src/hashtable_class_helper.pxi | 860 ---- pandas/src/hashtable_func_helper.pxi | 197 - pandas/src/join_helper.pxi | 1899 -------- pandas/src/joins_func_helper.pxi | 373 -- pandas/src/sparse_op_helper.pxi | 5864 ------------------------- 9 files changed, 1 insertion(+), 18281 deletions(-) delete mode 100644 pandas/src/algos_common_helper.pxi delete mode 100644 pandas/src/algos_groupby_helper.pxi delete mode 100644 pandas/src/algos_take_helper.pxi delete mode 100644 pandas/src/hashtable_class_helper.pxi delete mode 100644 pandas/src/hashtable_func_helper.pxi delete mode 100644 pandas/src/join_helper.pxi delete mode 100644 pandas/src/joins_func_helper.pxi delete mode 100644 pandas/src/sparse_op_helper.pxi diff --git a/.gitignore b/.gitignore index 19f1cc804dca0..a77e780f3332d 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ *.class *.dll *.exe +*.pxi *.o *.py[ocd] *.so diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi deleted file mode 100644 index 9dede87e0c15b..0000000000000 --- a/pandas/src/algos_common_helper.pxi +++ /dev/null @@ -1,2764 +0,0 @@ -""" -Template for each `dtype` helper function using 1-d template - -# 1-d template -- map_indices -- pad -- pad_1d -- pad_2d -- backfill -- backfill_1d -- backfill_2d -- is_monotonic -- arrmap - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# 1-d template -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float64(ndarray[float64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 
0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - float64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ 
- is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float64(ndarray[float64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float32(ndarray[float32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_float32(ndarray[float32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 
0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float32(ndarray[float32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - float32_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ 
- is_unique and (is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float32(ndarray[float32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_object(ndarray[object] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 
0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - object prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and (is_monotonic_inc or 
is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_object(ndarray[object] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int32(ndarray[int32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 
0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - int32_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and 
(is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int32(ndarray[int32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int64(ndarray[int64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 
0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - int64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and 
(is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int64(ndarray[int64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_bool(ndarray[uint8_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 
0 -D -""" - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1, -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1, -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec, is_unique - """ - cdef: - Py_ssize_t i, n - uint8_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - bint is_unique = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False, True - else: - return True, True, True - elif n < 2: - return True, True, True - - if timelike and arr[0] == iNaT: - return False, False, True - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec, \ - is_unique and 
(is_monotonic_inc or is_monotonic_dec) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_bool(ndarray[uint8_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -#---------------------------------------------------------------------- -# put template -#---------------------------------------------------------------------- - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float64(ndarray[float64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_float64_float64(ndarray[float64_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float64_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float32(ndarray[float32_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_float32_float32(ndarray[float32_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float32_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int8(ndarray[int8_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if 
periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_int8_float32(ndarray[int8_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float32_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int16(ndarray[int16_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_int16_float32(ndarray[int16_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float32_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int32(ndarray[int32_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_int32_float64(ndarray[int32_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - 
ndarray[float64_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int64(ndarray[int64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[float64_t] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] - -#---------------------------------------------------------------------- -# ensure_dtype -#---------------------------------------------------------------------- - -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.intp)).descr.type_num - -cpdef ensure_platform_int(object arr): - # GH3033, GH1392 - # platform int is the size of the int pointer, e.g. 
np.intp - if util.is_array(arr): - if ( arr).descr.type_num == PLATFORM_INT: - return arr - else: - return arr.astype(np.intp) - else: - return np.array(arr, dtype=np.intp) - -cpdef ensure_object(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: - return arr - else: - return arr.astype(np.object_) - elif hasattr(arr, 'asobject'): - return arr.asobject - else: - return np.array(arr, dtype=np.object_) - -cpdef ensure_float64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT64: - return arr - else: - return arr.astype(np.float64) - else: - return np.array(arr, dtype=np.float64) - -cpdef ensure_float32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT32: - return arr - else: - return arr.astype(np.float32) - else: - return np.array(arr, dtype=np.float32) - -cpdef ensure_int8(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT8: - return arr - else: - return arr.astype(np.int8) - else: - return np.array(arr, dtype=np.int8) - -cpdef ensure_int16(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT16: - return arr - else: - return arr.astype(np.int16) - else: - return np.array(arr, dtype=np.int16) - -cpdef ensure_int32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT32: - return arr - else: - return arr.astype(np.int32) - else: - return np.array(arr, dtype=np.int32) - -cpdef ensure_int64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT64: - return arr - else: - return arr.astype(np.int64) - else: - return np.array(arr, dtype=np.int64) diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi deleted file mode 100644 index 013a03f719bbd..0000000000000 --- a/pandas/src/algos_groupby_helper.pxi +++ /dev/null @@ -1,1375 +0,0 @@ -""" -Template for each `dtype` helper function using groupby - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" -_int64_max = np.iinfo(np.int64).max - -#---------------------------------------------------------------------- -# group_add, group_prod, group_var, group_mean, group_ohlc -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - 
ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, ct, oldmean - ndarray[float64_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) -# add passing bin edges, instead of labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') 
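
Note: the group_add / group_prod / group_mean / group_var kernels deleted above all follow one template — walk the rows once, skip labels < 0 and NaN values, accumulate into an (ngroups x K) scratch array, and write NaN back for empty groups. group_var additionally uses a single-pass Welford update so no second sweep over the data is needed. A rough NumPy sketch of that shared pattern and of the Welford step (illustrative only; the function name is not pandas API):

    import numpy as np

    def group_mean_var_sketch(values, labels, ngroups):
        # values: (N, K) float64, labels: (N,) int64 with -1 meaning "drop this row"
        N, K = values.shape
        nobs = np.zeros((ngroups, K))
        mean = np.zeros((ngroups, K))
        m2 = np.zeros((ngroups, K))              # running sum of squared deviations
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                if val == val:                   # same "not nan" test as the template
                    nobs[lab, j] += 1
                    old = mean[lab, j]
                    mean[lab, j] += (val - old) / nobs[lab, j]          # Welford update
                    m2[lab, j] += (val - mean[lab, j]) * (val - old)
        mean_out = np.where(nobs > 0, mean, np.nan)
        var_out = np.where(nobs > 1, m2 / np.maximum(nobs - 1, 1), np.nan)
        return mean_out, var_out
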
- - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, ct, oldmean - ndarray[float32_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in 
range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) -# add passing bin edges, instead of labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - -#---------------------------------------------------------------------- -# group_nth, group_last -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on 
axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - 
out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - -#---------------------------------------------------------------------- -# group_min, group_max -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] 
= minx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = minx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-_int64_max) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = 
maxx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(_int64_max) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = minx[i, j] - -#---------------------------------------------------------------------- -# other grouping functions not needing a template -#---------------------------------------------------------------------- - - -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - ngroups = len(counts) - N, K = ( values).shape - - indexer, _counts = groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:, :] out, - float64_t[:, :] values, - int64_t[:] labels, - float64_t[:, :] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:, :] out, - numeric[:, :] values, - int64_t[:] labels, - numeric[:, :] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] += val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:, :] label_indexer - - N, = ( labels).shape - - if periods < 0: - 
periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - - # Skip null keys - if lab == -1: - out[ii] = -1 - continue - - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii diff --git a/pandas/src/algos_take_helper.pxi b/pandas/src/algos_take_helper.pxi deleted file mode 100644 index d8fb05804d4e5..0000000000000 --- a/pandas/src/algos_take_helper.pxi +++ /dev/null @@ -1,4949 +0,0 @@ -""" -Template for each `dtype` helper function for take - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# take_1d, take_2d -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_bool_memview(uint8_t[:] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
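
Note: the group_shift_indexer kernel removed a little further up (at the end of algos_groupby_helper.pxi) keeps, per group, a small ring buffer of the last `periods` row positions seen. Once a group has seen more than `periods` rows, the slot about to be overwritten holds exactly the index `periods` steps back, which becomes the shifted position; until then -1 is emitted. Negative `periods` is handled by iterating in reverse. A small Python sketch of that bookkeeping for positive `periods` (illustrative only, not pandas API):

    import numpy as np

    def group_shift_indexer_sketch(labels, ngroups, periods):
        # For each row, return the position of the row `periods` earlier in the
        # same group, or -1 if there is no such row (or the label is -1).
        out = np.empty(len(labels), dtype=np.int64)
        seen = np.zeros(ngroups, dtype=np.int64)
        ring = np.zeros((ngroups, periods), dtype=np.int64)   # last `periods` positions per group
        for i, lab in enumerate(labels):
            if lab == -1:
                out[i] = -1
                continue
            seen[lab] += 1
            slot = seen[lab] % periods
            out[i] = ring[lab, slot] if seen[lab] > periods else -1
            ring[lab, slot] = i
        return out
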
- - cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
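
Note: the take_2d_axis0_* kernels above carry a fast path (GH3130): when both source and destination rows are contiguous in memory and wide enough to amortize the call, an entire row is copied with memmove instead of element by element. The IF True / IF False guards are how the template compiles that path in only for dtypes where a raw byte copy is valid (it is switched off for object output). Ignoring the fast path, the kernel computes roughly the following (illustrative sketch, not pandas API):

    import numpy as np

    def take_2d_axis0_sketch(values, indexer, out, fill_value=np.nan):
        # Gather whole rows of `values` into the preallocated `out`;
        # indexer entries of -1 become rows filled with fill_value.
        mask = indexer == -1
        out[:] = values[np.where(mask, 0, indexer), :]
        out[mask, :] = fill_value

    vals = np.arange(6, dtype=np.int8).reshape(3, 2)
    out = np.empty((4, 2), dtype=np.float64)
    take_2d_axis0_sketch(vals, np.array([2, -1, 0, 1]), out)   # row 1 becomes [nan, nan]
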
- cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[uint8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - uint8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_object_memview(uint8_t[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_object(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
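
Note: two more variants appear in this dtype block. take_2d_multi_* gathers with independent row and column indexers, and -1 in either one produces fill_value; the *_bool_object kernels re-box the uint8 storage as Python True/False on the way out, which is also why they run without nogil and have the memmove path compiled out (IF False). A sketch of the multi-axis gather only (illustrative, not pandas API):

    import numpy as np

    def take_2d_multi_sketch(values, row_idx, col_idx, out, fill_value=np.nan):
        # Independent gathers on both axes; -1 in either indexer yields fill_value.
        rmask, cmask = row_idx == -1, col_idx == -1
        out[:] = values[np.ix_(np.where(rmask, 0, row_idx),
                               np.where(cmask, 0, col_idx))]
        out[rmask, :] = fill_value
        out[:, cmask] = fill_value
        return out
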
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = True if values[idx, idx1[j]] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int8_memview(int8_t[:] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int8(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. 
Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
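
Note: every def wrapper in this file repeats the same dispatch: Cython typed memoryviews of this era could only be taken over writable buffers, so read-only input (for example an array whose writeable flag has been cleared, or one backing a memory map) is routed to an otherwise identical kernel typed as a plain ndarray, which is slightly slower but accepts such buffers. A minimal demonstration of the flag the wrappers branch on (plain NumPy, nothing pandas-specific):

    import numpy as np

    values = np.arange(5, dtype=np.int8)
    frozen = values.copy()
    frozen.setflags(write=False)          # simulate a read-only buffer

    print(values.flags.writeable)         # True  -> typed-memoryview fast path
    print(frozen.flags.writeable)         # False -> plain-ndarray fallback
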
- cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int32_memview(int8_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int32(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int64_memview(int8_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_float64_memview(int8_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_float64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
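
Note: the template is instantiated for (source, destination) dtype pairs such as int8 -> int32/int64/float64 because the destination dtype is decided before the gather: when the indexer contains -1 and the requested fill_value (NaN by default) cannot be represented in the source dtype, the output buffer is allocated in a wider type and the kernel writes straight into it. Roughly the following, as a sketch of the idea rather than the actual promotion logic in pandas:

    import numpy as np

    def take_1d_with_fill_sketch(values, indexer, fill_value=np.nan):
        # Choose an output dtype that can hold both the data and the fill value,
        # then gather; -1 positions become fill_value.
        mask = indexer == -1
        if mask.any():
            out_dtype = np.result_type(values.dtype, np.asarray(fill_value).dtype)
        else:
            out_dtype = values.dtype
        out = values.take(np.where(mask, 0, indexer)).astype(out_dtype)
        out[mask] = fill_value
        return out

    take_1d_with_fill_sketch(np.array([1, 2, 3], dtype=np.int8),
                             np.array([0, -1, 2]))      # -> array([ 1., nan,  3.])
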
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int16_memview(int16_t[:] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int16(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int16_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int16_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int32_memview(int16_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int32(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int64_memview(int16_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_float64_memview(int16_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_float64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int32_memview(int32_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int32(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
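Each def take_* wrapper above branches on values.flags.writeable because the typed-memoryview kernels in this generated file request a writable buffer; read-only arrays therefore fall back to the slower plain-ndarray body that follows the comment. A rough Python sketch of that dispatch (the branch labels and function name are illustrative; the real selection happens at the Cython level):

    import numpy as np

    def take_1d_dispatch(values, indexer, out, fill_value=np.nan):
        # The generated wrappers branch on exactly this flag: writable buffers
        # go through the typed-memoryview kernel, read-only buffers through the
        # ndarray kernel.  Both give the same result, so this sketch only
        # records which branch would have been taken.
        branch = "memoryview" if values.flags.writeable else "ndarray"
        for i, idx in enumerate(indexer):
            out[i] = fill_value if idx == -1 else values[idx]
        return branch, out

    readonly = np.arange(4, dtype=np.int16)
    readonly.setflags(write=False)        # e.g. an array backed by a read-only buffer
    print(take_1d_dispatch(readonly, np.array([3, -1]), np.empty(2)))
    # -> ('ndarray', array([ 3., nan]))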
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int64_memview(int32_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_float64_memview(int32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_float64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_int64_memview(int64_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_int64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_float64_memview(int64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_float64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
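The IF True: blocks guard the GH3130 optimisation: when values and out are both contiguous along the inner axis (the inner stride equals the item size) and the payload is at least 256 bytes, whole rows are copied with memmove instead of element by element. A loose NumPy analogue of that check and of the row copy, under those assumptions (function and variable names here are illustrative):

    import numpy as np

    def take_2d_axis0_sketch(values, indexer, out, fill_value=np.nan):
        # Rough analogue of the memmove fast path: contiguous rows of
        # sufficient size are copied in one shot, otherwise element by element.
        row_copy_ok = (values.strides[1] == out.strides[1] == values.itemsize
                       and values.itemsize * len(indexer) >= 256)
        for i, idx in enumerate(indexer):
            if idx == -1:
                out[i, :] = fill_value
            elif row_copy_ok:
                out[i, :] = values[idx, :]   # NumPy performs this as one block copy
            else:
                for j in range(values.shape[1]):
                    out[i, j] = values[idx, j]
        return out

    values = np.arange(12, dtype=np.int64).reshape(4, 3)
    out = np.empty((2, 3), dtype=np.int64)
    print(take_2d_axis0_sketch(values, np.array([2, -1]), out, fill_value=-1))
    # [[ 6  7  8]
    #  [-1 -1 -1]]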
- - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float32_memview(float32_t[:] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float32(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float64_memview(float32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float64(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. 
Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float64_float64_memview(float64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float64_float64(ndarray[float64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_object_object_memview(object[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - - - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_object_object(ndarray[object, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_object_object(ndarray[object, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] diff --git a/pandas/src/hashtable_class_helper.pxi b/pandas/src/hashtable_class_helper.pxi deleted file mode 100644 index da0c76aeca86f..0000000000000 --- a/pandas/src/hashtable_class_helper.pxi +++ /dev/null @@ -1,860 +0,0 @@ -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# VectorData -#---------------------------------------------------------------------- - - -ctypedef struct Float64VectorData: - float64_t *data - size_t n, m - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void append_data_float64(Float64VectorData *data, - float64_t x) nogil: - - data.data[data.n] = x - data.n += 1 - - -ctypedef struct Int64VectorData: - int64_t *data - size_t n, m - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void append_data_int64(Int64VectorData *data, - int64_t x) nogil: - - data.data[data.n] = x - data.n += 1 - -ctypedef fused vector_data: - Int64VectorData - Float64VectorData - -cdef bint needs_resize(vector_data *data) nogil: - return data.n == data.m - -#---------------------------------------------------------------------- -# Vector -#---------------------------------------------------------------------- - -cdef class Float64Vector: - - cdef: - Float64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc( - sizeof(Float64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.float64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, float64_t x): - - if needs_resize(self.data): - self.resize() - - append_data_float64(self.data, x) - -cdef class Int64Vector: - - cdef: - Int64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc( - sizeof(Int64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.int64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def 
to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, int64_t x): - - if needs_resize(self.data): - self.resize() - - append_data_int64(self.data, x) - - -cdef class ObjectVector: - - cdef: - PyObject **data - size_t n, m - ndarray ao - - def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP - self.ao = np.empty(_INIT_VEC_CAP, dtype=object) - self.data = self.ao.data - - def __len__(self): - return self.n - - cdef inline append(self, object o): - if self.n == self.m: - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) - self.data = self.ao.data - - Py_INCREF(o) - self.data[self.n] = o - self.n += 1 - - def to_array(self): - self.ao.resize(self.n) - self.m = self.n - return self.ao - - -#---------------------------------------------------------------------- -# HashTable -#---------------------------------------------------------------------- - - -cdef class HashTable: - pass - -cdef class Float64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_float64() - if size_hint is not None: - kh_resize_float64(self.table, size_hint) - - def __len__(self): - return self.table.size - - def __dealloc__(self): - kh_destroy_float64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_float64(self.table, key) - return k != self.table.n_buckets - - cpdef get_item(self, float64_t val): - cdef khiter_t k - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, float64_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, float64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_float64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_float64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - @cython.boundscheck(False) - def map(self, float64_t[:] keys, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t key - khiter_t k - - with nogil: - for i in range(n): - key = keys[i] - k = kh_put_float64(self.table, key, &ret) - self.table.vals[k] = values[i] - - @cython.boundscheck(False) - def map_locations(self, ndarray[float64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def factorize(self, float64_t values): - uniques = Float64Vector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, float64_t[:] values, Float64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - float64_t val - khiter_t k - 
Float64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if check_null and val != val: - labels[i] = na_sentinel - continue - - k = kh_get_float64(self.table, val) - - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def get_labels_groupby(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = 0 - int ret = 0 - float64_t val - khiter_t k - Float64Vector uniques = Float64Vector() - Float64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, val) - labels[i] = count - count += 1 - - arr_uniques = uniques.to_array() - - return np.asarray(labels), arr_uniques - - @cython.boundscheck(False) - def unique(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - bint seen_na = 0 - Float64Vector uniques = Float64Vector() - Float64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(self.table, val) - if k == self.table.n_buckets: - kh_put_float64(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, val) - elif not seen_na: - seen_na = 1 - if needs_resize(ud): - with gil: - uniques.resize() - append_data_float64(ud, NAN) - - return uniques.to_array() - -cdef class Int64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_int64() - if size_hint is not None: - kh_resize_int64(self.table, size_hint) - - def __len__(self): - return self.table.size - - def __dealloc__(self): - kh_destroy_int64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_int64(self.table, key) - return k != self.table.n_buckets - - cpdef get_item(self, int64_t val): - cdef khiter_t k - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, int64_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, int64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_int64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_int64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - @cython.boundscheck(False) - def map(self, int64_t[:] keys, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t key - khiter_t k - - with nogil: - for i in range(n): - key = keys[i] - k = kh_put_int64(self.table, key, &ret) - self.table.vals[k] = values[i] - - @cython.boundscheck(False) - def map_locations(self, ndarray[int64_t, ndim=1] values): 
- cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def factorize(self, int64_t values): - uniques = Int64Vector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, int64_t[:] values, Int64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - int64_t val - khiter_t k - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if check_null and val == iNaT: - labels[i] = na_sentinel - continue - - k = kh_get_int64(self.table, val) - - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_int64(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def get_labels_groupby(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = 0 - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data_int64(ud, val) - labels[i] = count - count += 1 - - arr_uniques = uniques.to_array() - - return np.asarray(labels), arr_uniques - - @cython.boundscheck(False) - def unique(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - bint seen_na = 0 - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - k = kh_get_int64(self.table, val) - if k == self.table.n_buckets: - kh_put_int64(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_int64(ud, val) - - return uniques.to_array() - - -cdef class StringHashTable(HashTable): - cdef kh_str_t *table - - def __cinit__(self, int size_hint=1): - self.table = kh_init_str() - if size_hint is not None: - kh_resize_str(self.table, size_hint) - - def __dealloc__(self): - kh_destroy_str(self.table) - - cpdef get_item(self, object val): - cdef khiter_t k - k = kh_get_str(self.table, util.get_c_string(val)) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - for i 
in range(iterations): - k = kh_get_str(self.table, util.get_c_string(key)) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - buf = util.get_c_string(key) - - k = kh_put_str(self.table, buf, &ret) - self.table.keys[k] = key - if kh_exist_str(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def get_indexer(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - char *buf - int64_t *resbuf = labels.data - khiter_t k - kh_str_t *table = self.table - - for i in range(n): - buf = util.get_c_string(values[i]) - k = kh_get_str(table, buf) - if k != table.n_buckets: - resbuf[i] = table.vals[k] - else: - resbuf[i] = -1 - return labels - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - char *buf - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k == self.table.n_buckets: - kh_put_str(self.table, buf, &ret) - uniques.append(val) - - return uniques.to_array() - - def factorize(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - dict reverse = {} - Py_ssize_t idx, count = 0 - int ret = 0 - object val - char *buf - khiter_t k - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) - - self.table.vals[k] = count - reverse[count] = val - labels[i] = count - count += 1 - - return reverse, labels - - -na_sentinel = object - -cdef class PyObjectHashTable(HashTable): - - def __init__(self, size_hint=1): - self.table = kh_init_pymap() - kh_resize_pymap(self.table, size_hint) - - def __dealloc__(self): - if self.table is not NULL: - self.destroy() - - def __len__(self): - return self.table.size - - def __contains__(self, object key): - cdef khiter_t k - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_get_pymap(self.table, key) - return k != self.table.n_buckets - - def destroy(self): - kh_destroy_pymap(self.table) - self.table = NULL - - cpdef get_item(self, object val): - cdef khiter_t k - if val != val or val is None: - val = na_sentinel - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_put_pymap(self.table, key, &ret) - # self.table.keys[k] = key - if kh_exist_pymap(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def map_locations(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_put_pymap(self.table, 
val, &ret) - self.table.vals[k] = i - - def lookup(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - bint seen_na = 0 - - for i in range(n): - val = values[i] - hash(val) - if not _checknan(val): - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - elif not seen_na: - seen_na = 1 - uniques.append(nan) - - return uniques.to_array() - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - khiter_t k - - labels = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - - if check_null and val != val or val is None: - labels[i] = na_sentinel - continue - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 - - return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_func_helper.pxi b/pandas/src/hashtable_func_helper.pxi deleted file mode 100644 index d05b81acc5dd5..0000000000000 --- a/pandas/src/hashtable_func_helper.pxi +++ /dev/null @@ -1,197 +0,0 @@ -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# VectorData -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef build_count_table_float64(float64_t[:] values, - kh_float64_t *table, bint dropna): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - float64_t val - int ret = 0 - - with nogil: - kh_resize_float64(table, n) - - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_float64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_float64(table, val, &ret) - table.vals[k] = 1 - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef value_count_float64(float64_t[:] values, bint dropna): - cdef: - Py_ssize_t i=0 - kh_float64_t *table - float64_t[:] result_keys - int64_t[:] result_counts - int k - - table = kh_init_float64() - build_count_table_float64(values, table, dropna) - - result_keys = np.empty(table.n_occupied, dtype=np.float64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - - with nogil: - for k in range(table.n_buckets): - if kh_exist_float64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_float64(table) - - return np.asarray(result_keys), np.asarray(result_counts) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_float64(float64_t[:] values, - object keep='first'): - 
cdef: - int ret = 0, k - float64_t value - Py_ssize_t i, n = len(values) - kh_float64_t * table = kh_init_float64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_float64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_float64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_float64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_float64(table) - return out - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef build_count_table_int64(int64_t[:] values, - kh_int64_t *table, bint dropna): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - int64_t val - int ret = 0 - - with nogil: - kh_resize_int64(table, n) - - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 - - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef value_count_int64(int64_t[:] values, bint dropna): - cdef: - Py_ssize_t i=0 - kh_int64_t *table - int64_t[:] result_keys - int64_t[:] result_counts - int k - - table = kh_init_int64() - build_count_table_int64(values, table, dropna) - - result_keys = np.empty(table.n_occupied, dtype=np.int64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - - with nogil: - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_int64(table) - - return np.asarray(result_keys), np.asarray(result_counts) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def duplicated_int64(int64_t[:] values, - object keep='first'): - cdef: - int ret = 0, k - int64_t value - Py_ssize_t i, n = len(values) - kh_int64_t * table = kh_init_int64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) - - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') - - if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_int64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_int64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_int64(table) - return out diff --git a/pandas/src/join_helper.pxi b/pandas/src/join_helper.pxi deleted file mode 100644 index 44b8159351492..0000000000000 --- a/pandas/src/join_helper.pxi +++ /dev/null @@ -1,1899 +0,0 @@ -""" -Template for each `dtype` helper function for join - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# left_join_indexer, inner_join_indexer, outer_join_indexer 
-#---------------------------------------------------------------------- - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - 
lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - 
lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - object lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - 
rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - 
rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -# Joins on ordered, unique indices - -# right might contain non-unique values - - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - 
rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer diff --git a/pandas/src/joins_func_helper.pxi b/pandas/src/joins_func_helper.pxi deleted file mode 100644 index 7a59da37c5ced..0000000000000 --- a/pandas/src/joins_func_helper.pxi +++ /dev/null @@ -1,373 +0,0 @@ -""" -Template for each `dtype` helper function for hashtable - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# asof_join_by -#---------------------------------------------------------------------- - - -from hashtable cimport * - - -def asof_join_int64_t_by_object(ndarray[int64_t] left_values, - ndarray[int64_t] right_values, - ndarray[object] left_by_values, - ndarray[object] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - int64_t tolerance_ - PyObjectHashTable hash_table - object by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = PyObjectHashTable(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_double_by_object(ndarray[double] left_values, - ndarray[double] right_values, - ndarray[object] left_by_values, - ndarray[object] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - double tolerance_ - PyObjectHashTable hash_table - object by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = PyObjectHashTable(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value 
- if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_int64_t_by_int64_t(ndarray[int64_t] left_values, - ndarray[int64_t] right_values, - ndarray[int64_t] left_by_values, - ndarray[int64_t] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - int64_t tolerance_ - Int64HashTable hash_table - int64_t by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = Int64HashTable(right_size) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_double_by_int64_t(ndarray[double] left_values, - ndarray[double] right_values, - ndarray[int64_t] left_by_values, - ndarray[int64_t] right_by_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - double tolerance_ - Int64HashTable hash_table - int64_t by_value - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - hash_table = Int64HashTable(right_size) - - 
right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - hash_table.set_item(right_by_values[right_pos], right_pos) - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = hash_table.get_item(by_value)\ - if by_value in hash_table else -1 - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = found_right_pos - - # if needed, verify that tolerance is met - if has_tolerance and found_right_pos != -1: - diff = left_values[left_pos] - right_values[found_right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -#---------------------------------------------------------------------- -# asof_join -#---------------------------------------------------------------------- - - -def asof_join_int64_t(ndarray[int64_t] left_values, - ndarray[int64_t] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - int64_t tolerance_ - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer - - -def asof_join_double(ndarray[double] left_values, - ndarray[double] right_values, - bint allow_exact_matches=1, - tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 - double tolerance_ - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = 1 - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's value - if allow_exact_matches: - while 
right_pos < right_size and\ - right_values[right_pos] <= left_values[left_pos]: - right_pos += 1 - else: - while right_pos < right_size and\ - right_values[right_pos] < left_values[left_pos]: - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer diff --git a/pandas/src/sparse_op_helper.pxi b/pandas/src/sparse_op_helper.pxi deleted file mode 100644 index 8462c31c84679..0000000000000 --- a/pandas/src/sparse_op_helper.pxi +++ /dev/null @@ -1,5864 +0,0 @@ -""" -Template for each `dtype` helper function for sparse ops - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -#---------------------------------------------------------------------- -# Sparse op -#---------------------------------------------------------------------- - -cdef inline float64_t __div_float64(float64_t a, float64_t b): - if b == 0: - if a > 0: - return INF - elif a < 0: - return -INF - else: - return NaN - else: - return float(a) / b - -cdef inline float64_t __truediv_float64(float64_t a, float64_t b): - return __div_float64(a, b) - -cdef inline float64_t __floordiv_float64(float64_t a, float64_t b): - if b == 0: - # numpy >= 1.11 returns NaN - # for a // 0, rather than +-inf - if _np_version_under1p11: - if a > 0: - return INF - elif a < 0: - return -INF - return NaN - else: - return a // b - -cdef inline float64_t __mod_float64(float64_t a, float64_t b): - if b == 0: - return NaN - else: - return a % b - -cdef inline float64_t __div_int64(int64_t a, int64_t b): - if b == 0: - if a > 0: - return INF - elif a < 0: - return -INF - else: - return NaN - else: - return float(a) / b - -cdef inline float64_t __truediv_int64(int64_t a, int64_t b): - return __div_int64(a, b) - -cdef inline int64_t __floordiv_int64(int64_t a, int64_t b): - if b == 0: - return 0 - else: - return a // b - -cdef inline int64_t __mod_int64(int64_t a, int64_t b): - if b == 0: - return 0 - else: - return a % b - -#---------------------------------------------------------------------- -# sparse array op -#---------------------------------------------------------------------- - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_add_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_add_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - return out, out_index, xfill + yfill - - -cpdef sparse_add_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_add_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_add_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_add_float64(float64_t xfill, - float64_t yfill): - return xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_add_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_add_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] + y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] + yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill + y[yi] - yi += 1 - - return out, out_index, xfill + yfill - - -cpdef sparse_add_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_add_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_add_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_add_int64(int64_t xfill, - int64_t yfill): - return xfill + yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_sub_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_sub_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - return out, out_index, xfill - yfill - - -cpdef sparse_sub_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_sub_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_sub_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_sub_float64(float64_t xfill, - float64_t yfill): - return xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_sub_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_sub_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] - y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] - yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill - y[yi] - yi += 1 - - return out, out_index, xfill - yfill - - -cpdef sparse_sub_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_sub_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_sub_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_sub_int64(int64_t xfill, - int64_t yfill): - return xfill - yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mul_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mul_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - return out, out_index, xfill * yfill - - -cpdef sparse_mul_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mul_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mul_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mul_float64(float64_t xfill, - float64_t yfill): - return xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mul_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mul_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] * y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] * yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill * y[yi] - yi += 1 - - return out, out_index, xfill * yfill - - -cpdef sparse_mul_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mul_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mul_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mul_int64(int64_t xfill, - int64_t yfill): - return xfill * yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_div_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __div_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_div_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __div_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __div_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __div_float64(xfill, yfill) - - -cpdef sparse_div_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_div_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_div_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_div_float64(float64_t xfill, - float64_t yfill): - return __div_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_div_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __div_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_div_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __div_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __div_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __div_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __div_int64(xfill, yfill) - - -cpdef sparse_div_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_div_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_div_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_div_int64(int64_t xfill, - int64_t yfill): - return __div_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mod_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __mod_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mod_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __mod_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __mod_float64(xfill, yfill) - - -cpdef sparse_mod_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mod_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mod_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mod_float64(float64_t xfill, - float64_t yfill): - return __mod_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_mod_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __mod_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_mod_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __mod_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __mod_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __mod_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __mod_int64(xfill, yfill) - - -cpdef sparse_mod_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_mod_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_mod_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_mod_int64(int64_t xfill, - int64_t yfill): - return __mod_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_truediv_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __truediv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_truediv_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __truediv_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __truediv_float64(xfill, yfill) - - -cpdef sparse_truediv_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_truediv_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_truediv_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_truediv_float64(float64_t xfill, - float64_t yfill): - return __truediv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_truediv_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __truediv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_truediv_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __truediv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __truediv_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __truediv_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __truediv_int64(xfill, yfill) - - -cpdef sparse_truediv_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_truediv_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_truediv_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_truediv_int64(int64_t xfill, - int64_t yfill): - return __truediv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_floordiv_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __floordiv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_floordiv_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_float64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_float64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __floordiv_float64(xfill, y[yi]) - yi += 1 - - return out, out_index, __floordiv_float64(xfill, yfill) - - -cpdef sparse_floordiv_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_floordiv_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_floordiv_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_floordiv_float64(float64_t xfill, - float64_t yfill): - return __floordiv_float64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_floordiv_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, __floordiv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_floordiv_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = __floordiv_int64(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = __floordiv_int64(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = __floordiv_int64(xfill, y[yi]) - yi += 1 - - return out, out_index, __floordiv_int64(xfill, yfill) - - -cpdef sparse_floordiv_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_floordiv_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_floordiv_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_floordiv_int64(int64_t xfill, - int64_t yfill): - return __floordiv_int64(xfill, yfill) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_pow_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_pow_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - return out, out_index, xfill ** yfill - - -cpdef sparse_pow_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_pow_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_pow_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_pow_float64(float64_t xfill, - float64_t yfill): - return xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_pow_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_pow_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[int64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.int64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] ** y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] ** yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill ** y[yi] - yi += 1 - - return out, out_index, xfill ** yfill - - -cpdef sparse_pow_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_pow_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_pow_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_pow_int64(int64_t xfill, - int64_t yfill): - return xfill ** yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_eq_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_eq_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - return out, out_index, xfill == yfill - - -cpdef sparse_eq_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_eq_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_eq_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_eq_float64(float64_t xfill, - float64_t yfill): - return xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_eq_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_eq_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] == y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] == yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill == y[yi] - yi += 1 - - return out, out_index, xfill == yfill - - -cpdef sparse_eq_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_eq_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_eq_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_eq_int64(int64_t xfill, - int64_t yfill): - return xfill == yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ne_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ne_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - return out, out_index, xfill != yfill - - -cpdef sparse_ne_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ne_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ne_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ne_float64(float64_t xfill, - float64_t yfill): - return xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ne_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ne_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] != y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] != yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill != y[yi] - yi += 1 - - return out, out_index, xfill != yfill - - -cpdef sparse_ne_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ne_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ne_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ne_int64(int64_t xfill, - int64_t yfill): - return xfill != yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_lt_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_lt_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - return out, out_index, xfill < yfill - - -cpdef sparse_lt_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_lt_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_lt_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_lt_float64(float64_t xfill, - float64_t yfill): - return xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_lt_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_lt_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] < y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] < yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill < y[yi] - yi += 1 - - return out, out_index, xfill < yfill - - -cpdef sparse_lt_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_lt_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_lt_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_lt_int64(int64_t xfill, - int64_t yfill): - return xfill < yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_gt_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_gt_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - return out, out_index, xfill > yfill - - -cpdef sparse_gt_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_gt_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_gt_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_gt_float64(float64_t xfill, - float64_t yfill): - return xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_gt_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_gt_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] > y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] > yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill > y[yi] - yi += 1 - - return out, out_index, xfill > yfill - - -cpdef sparse_gt_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_gt_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_gt_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_gt_int64(int64_t xfill, - int64_t yfill): - return xfill > yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_le_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_le_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - return out, out_index, xfill <= yfill - - -cpdef sparse_le_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_le_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_le_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_le_float64(float64_t xfill, - float64_t yfill): - return xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_le_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_le_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] <= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] <= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill <= y[yi] - yi += 1 - - return out, out_index, xfill <= yfill - - -cpdef sparse_le_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_le_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_le_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_le_int64(int64_t xfill, - int64_t yfill): - return xfill <= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ge_float64(ndarray x_, - BlockIndex xindex, - float64_t xfill, - ndarray y_, - BlockIndex yindex, - float64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ge_float64(ndarray x_, IntIndex xindex, - float64_t xfill, - ndarray y_, IntIndex yindex, - float64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - return out, out_index, xfill >= yfill - - -cpdef sparse_ge_float64(ndarray[float64_t, ndim=1] x, - SparseIndex xindex, float64_t xfill, - ndarray[float64_t, ndim=1] y, - SparseIndex yindex, float64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ge_float64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ge_float64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ge_float64(float64_t xfill, - float64_t yfill): - return xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_ge_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_ge_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] >= y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] >= yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill >= y[yi] - yi += 1 - - return out, out_index, xfill >= yfill - - -cpdef sparse_ge_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_ge_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_ge_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_ge_int64(int64_t xfill, - int64_t yfill): - return xfill >= yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_and_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_and_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - return out, out_index, xfill & yfill - - -cpdef sparse_and_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_and_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_and_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_and_int64(int64_t xfill, - int64_t yfill): - return xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_and_uint8(ndarray x_, - BlockIndex xindex, - uint8_t xfill, - ndarray y_, - BlockIndex yindex, - uint8_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_and_uint8(ndarray x_, IntIndex xindex, - uint8_t xfill, - ndarray y_, IntIndex yindex, - uint8_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] & y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] & yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill & y[yi] - yi += 1 - - return out, out_index, xfill & yfill - - -cpdef sparse_and_uint8(ndarray[uint8_t, ndim=1] x, - SparseIndex xindex, uint8_t xfill, - ndarray[uint8_t, ndim=1] y, - SparseIndex yindex, uint8_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_and_uint8(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_and_uint8(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_and_uint8(uint8_t xfill, - uint8_t yfill): - return xfill & yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_or_int64(ndarray x_, - BlockIndex xindex, - int64_t xfill, - ndarray y_, - BlockIndex yindex, - int64_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill | yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_or_int64(ndarray x_, IntIndex xindex, - int64_t xfill, - ndarray y_, IntIndex yindex, - int64_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[int64_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - return out, out_index, xfill | yfill - - -cpdef sparse_or_int64(ndarray[int64_t, ndim=1] x, - SparseIndex xindex, int64_t xfill, - ndarray[int64_t, ndim=1] y, - SparseIndex yindex, int64_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_or_int64(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_or_int64(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_or_int64(int64_t xfill, - int64_t yfill): - return xfill | yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple block_op_or_uint8(ndarray x_, - BlockIndex xindex, - uint8_t xfill, - ndarray y_, - BlockIndex yindex, - uint8_t yfill): - ''' - Binary operator on BlockIndex objects with fill values - ''' - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index, xfill | yfill - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline tuple int_op_or_uint8(ndarray x_, IntIndex xindex, - uint8_t xfill, - ndarray y_, IntIndex yindex, - uint8_t yfill): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[uint8_t, ndim=1] x, y - ndarray[uint8_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.uint8) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = x[xi] | y[yi] - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = x[xi] | yfill - xi += 1 - else: - # use x fill value - out[out_i] = xfill | y[yi] - yi += 1 - - return out, out_index, xfill | yfill - - -cpdef sparse_or_uint8(ndarray[uint8_t, ndim=1] x, - SparseIndex xindex, uint8_t xfill, - ndarray[uint8_t, ndim=1] y, - SparseIndex yindex, uint8_t yfill): - - if isinstance(xindex, BlockIndex): - return block_op_or_uint8(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill) - elif isinstance(xindex, IntIndex): - return int_op_or_uint8(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill) - else: - raise NotImplementedError - - -cpdef sparse_fill_or_uint8(uint8_t xfill, - uint8_t yfill): - return xfill | yfill From b1d95990b4bad8e6dd555607e98dad690cfd59d6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 25 Nov 2016 12:00:49 -0500 Subject: [PATCH 096/183] DOC: missing ref in timeseries.rst (#14745) --- doc/source/timeseries.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 4132d25e9be48..854de443ac5ee 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1286,12 +1286,11 @@ secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. ``.resample()`` is a time-based groupby, followed by a reduction method on each of its groups. +See some :ref:`cookbook examples ` for some advanced strategies .. note:: - ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion `here ` - -See some :ref:`cookbook examples ` for some advanced strategies + ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion :ref:`here ` .. ipython:: python From d8e427bda07fc350dfa327df43b790697e733747 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 25 Nov 2016 16:21:03 -0500 Subject: [PATCH 097/183] BUG: Improve error message for multi-char sep and quotes in Python engine (#14582) If there is a field counts mismatch, check whether a multi-char sep was used in conjunction with quotes. Currently, that setup is not respected and can result in improper line breaks. Closes gh-13374. 
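
A rough standalone sketch of the behaviour this patch improves. The sample data is the
same string used by the test added below; the exact error wording is an assumption based
on the new code path, and the snippet is illustrative rather than part of the patch:

    from io import StringIO
    import pandas as pd

    data = 'a,,b\n1,,a\n2,,"2,,b"'

    # The Python engine splits on the multi-char delimiter without honouring the
    # quoted field, so the last row appears to have one column too many.
    try:
        pd.read_csv(StringIO(data), sep=',,', engine='python')
    except ValueError as err:
        # With this change the field-count error also explains that quotes are
        # ignored when a multi-char delimiter is used.
        print(err)
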
--- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/io/parsers.py | 5 +++++ pandas/io/tests/parser/python_parser_only.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 49c8330490ed1..d9aa92270669d 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -30,6 +30,7 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`) +- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 929b360854d5b..1e32d7086ed5e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2515,6 +2515,11 @@ def _rows_to_cols(self, content): msg = ('Expected %d fields in line %d, saw %d' % (col_len, row_num + 1, zip_len)) + if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE: + # see gh-13374 + reason = ('Error could possibly be due to quotes being ' + 'ignored when a multi-char delimiter is used.') + msg += '. ' + reason raise ValueError(msg) if self.usecols: diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index bbc1c3bab7635..55801b4a9788e 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -7,6 +7,7 @@ arguments when parsing. """ +import csv import sys import nose @@ -204,3 +205,19 @@ def test_encoding_non_utf8_multichar_sep(self): sep=sep, names=['a', 'b'], encoding=encoding) tm.assert_frame_equal(result, expected) + + def test_multi_char_sep_quotes(self): + # see gh-13374 + + data = 'a,,b\n1,,a\n2,,"2,,b"' + msg = 'ignored when a multi-char delimiter is used' + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), sep=',,') + + # We expect no match, so there should be an assertion + # error out of the inner context manager. + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), sep=',,', + quoting=csv.QUOTE_NONE) From 58731c44af7241d0c5c8f79538fea946e00aab29 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 25 Nov 2016 14:47:50 -0800 Subject: [PATCH 098/183] DOC: Remove outdated caveats for Anaconda and HTML parsing (#9032) (#14739) --- doc/source/gotchas.rst | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index cfac5c257184d..8a1e06fa6d86c 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -514,40 +514,6 @@ parse HTML tables in the top-level pandas io function ``read_html``. text from the URL over the web, i.e., IO (input-output). For very large tables, this might not be true. -**Issues with using** |Anaconda|_ - - * `Anaconda`_ ships with `lxml`_ version 3.2.0; the following workaround for - `Anaconda`_ was successfully used to deal with the versioning issues - surrounding `lxml`_ and `BeautifulSoup4`_. - - .. 
note:: - - Unless you have *both*: - - * A strong restriction on the upper bound of the runtime of some code - that incorporates :func:`~pandas.io.html.read_html` - * Complete knowledge that the HTML you will be parsing will be 100% - valid at all times - - then you should install `html5lib`_ and things will work swimmingly - without you having to muck around with `conda`. If you want the best of - both worlds then install both `html5lib`_ and `lxml`_. If you do install - `lxml`_ then you need to perform the following commands to ensure that - lxml will work correctly: - - .. code-block:: sh - - # remove the included version - conda remove lxml - - # install the latest version of lxml - pip install 'git+git://github.com/lxml/lxml.git' - - # install the latest version of beautifulsoup4 - pip install 'bzr+lp:beautifulsoup' - - Note that you need `bzr `__ and `git - `__ installed to perform the last two operations. .. |svm| replace:: **strictly valid markup** .. _svm: http://validator.w3.org/docs/help.html#validation_basics @@ -561,9 +527,6 @@ parse HTML tables in the top-level pandas io function ``read_html``. .. |lxml| replace:: **lxml** .. _lxml: http://lxml.de -.. |Anaconda| replace:: **Anaconda** -.. _Anaconda: https://store.continuum.io/cshop/anaconda - Byte-Ordering Issues -------------------- From 75bb5308dda18f5b0a6fc60b2cb837b8c115e011 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Sat, 26 Nov 2016 03:12:22 -0600 Subject: [PATCH 099/183] API: add dtype= option to python parser (#14295) --- doc/source/io.rst | 10 +- doc/source/whatsnew/v0.20.0.txt | 9 + pandas/io/parsers.py | 132 ++++++++-- pandas/io/tests/parser/c_parser_only.py | 242 +----------------- pandas/io/tests/parser/dtypes.py | 274 +++++++++++++++++++++ pandas/io/tests/parser/test_parsers.py | 4 +- pandas/io/tests/parser/test_unsupported.py | 10 - pandas/parser.pyx | 59 +++-- 8 files changed, 435 insertions(+), 305 deletions(-) create mode 100644 pandas/io/tests/parser/dtypes.py diff --git a/doc/source/io.rst b/doc/source/io.rst index ee319092c6dd5..b1c151def26af 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None`` Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` (unsupported with ``engine='python'``). Use `str` or `object` to preserve and not interpret dtype. + + .. versionadded:: 0.20.0 support for the Python parser. + engine : {``'c'``, ``'python'``} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -473,10 +476,9 @@ However, if you wanted for all the data to be coerced, no matter the type, then using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be worth trying. -.. note:: - The ``dtype`` option is currently only supported by the C engine. - Specifying ``dtype`` with ``engine`` other than 'c' raises a - ``ValueError``. + .. versionadded:: 0.20.0 support for the Python parser. + + The ``dtype`` option is supported by the 'python' engine .. 
note:: In some cases, reading in abnormal data with columns containing mixed dtypes diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 65b62601c7022..6e3559bee728d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -22,8 +22,17 @@ New features ~~~~~~~~~~~~ +``read_csv`` supports ``dtype`` keyword for python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns + is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs ` for more information. +.. ipython:: python + + data = "a,b\n1,2\n3,4" + pd.read_csv(StringIO(data), engine='python').dtypes + pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes .. _whatsnew_0200.enhancements.other: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1e32d7086ed5e..94eb015701004 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -17,11 +17,15 @@ zip, string_types, map, u) from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, - is_float, - is_scalar) + is_float, is_dtype_equal, + is_object_dtype, + is_scalar, is_categorical_dtype) +from pandas.types.missing import isnull +from pandas.types.cast import _astype_nansafe from pandas.core.index import Index, MultiIndex, RangeIndex from pandas.core.series import Series from pandas.core.frame import DataFrame +from pandas.core.categorical import Categorical from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser @@ -111,8 +115,9 @@ are duplicate names in the columns. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} - (Unsupported with engine='python'). Use `str` or `object` to preserve and - not interpret dtype. + Use `str` or `object` to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. %s converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can either @@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds): 'true_values': None, 'false_values': None, 'converters': None, + 'dtype': None, 'skipfooter': 0, 'keep_default_na': True, @@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines': None, 'error_bad_lines': True, 'warn_bad_lines': True, - 'dtype': None, 'float_precision': None } @@ -476,7 +481,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines', 'error_bad_lines', 'warn_bad_lines', - 'dtype', 'float_precision', ]) _deprecated_args = set([ @@ -834,9 +838,6 @@ def _clean_options(self, options, engine): " ignored as it is not supported by the 'python'" " engine.").format(reason=fallback_reason, option=arg) - if arg == 'dtype': - msg += " (Note the 'converters' option provides"\ - " similar functionality.)" raise ValueError(msg) del result[arg] @@ -1285,7 +1286,7 @@ def _agg_index(self, index, try_parse_dates=True): col_na_values, col_na_fvalues = _get_na_values( col_name, self.na_values, self.na_fvalues) - arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues) + arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) index = MultiIndex.from_arrays(arrays, names=self.index_names) @@ -1293,10 +1294,15 @@ def _agg_index(self, index, try_parse_dates=True): return index def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, - converters=None): + converters=None, dtypes=None): result = {} for c, values in compat.iteritems(dct): conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( @@ -1304,17 +1310,35 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: col_na_values, col_na_fvalues = set(), set() - coerce_type = True if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(c), ParserWarning, + stacklevel=7) + try: values = lib.map_infer(values, conv_f) except ValueError: mask = lib.ismember(values, na_values).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) - coerce_type = False - cvals, na_count = self._convert_types( - values, set(col_na_values) | col_na_fvalues, coerce_type) + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_num_bool=False) + else: + # skip inference if specified dtype is object + try_num_bool = not (cast_type and is_object_dtype(cast_type)) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_num_bool) + + # type specificed in dtype param + if cast_type and not is_dtype_equal(cvals, cast_type): + cvals = self._cast_types(cvals, cast_type, c) if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( @@ -1326,7 +1350,23 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, print('Filled %d NA values in column %s' % (na_count, str(c))) return result - def _convert_types(self, values, na_values, try_num_bool=True): + def _infer_types(self, values, na_values, try_num_bool=True): + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + try_num_bool : bool, default try + try to cast values to numeric 
(first preference) or boolean + + Returns: + -------- + converted : ndarray + na_count : int + """ + na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1340,6 +1380,7 @@ def _convert_types(self, values, na_values, try_num_bool=True): if try_num_bool: try: result = lib.maybe_convert_numeric(values, na_values, False) + na_count = isnull(result).sum() except Exception: result = values if values.dtype == np.object_: @@ -1356,6 +1397,38 @@ def _convert_types(self, values, na_values, try_num_bool=True): return result, na_count + def _cast_types(self, values, cast_type, column): + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray + cast_type : string or np.dtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray + """ + + if is_categorical_dtype(cast_type): + # XXX this is for consistency with + # c-parser which parses all categories + # as strings + if not is_object_dtype(values): + values = _astype_nansafe(values, str) + values = Categorical(values) + else: + try: + values = _astype_nansafe(values, cast_type, copy=True) + except ValueError: + raise ValueError("Unable to convert column %s to " + "type %s" % (column, cast_type)) + return values + def _do_date_conversions(self, names, data): # returns data, columns if self.parse_dates is not None: @@ -1784,6 +1857,7 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.dtype = kwds['dtype'] self.compact_ints = kwds['compact_ints'] self.use_unsigned = kwds['use_unsigned'] @@ -1982,7 +2056,7 @@ def read(self, rows=None): # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names) + names, self.index_col, self.index_names, self.dtype) columns = self._maybe_make_multi_index_columns( columns, self.col_names) return index, columns, col_dict @@ -2033,15 +2107,25 @@ def get_chunk(self, size=None): def _convert_data(self, data): # apply converters - clean_conv = {} - - for col, f in compat.iteritems(self.converters): - if isinstance(col, int) and col not in self.orig_names: - col = self.orig_names[col] - clean_conv[col] = f + def _clean_mapping(mapping): + "converts col numbers to names" + clean = {} + for col, v in compat.iteritems(mapping): + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + clean[col] = v + return clean + + clean_conv = _clean_mapping(self.converters) + if not isinstance(self.dtype, dict): + # handles single dtype applied to all columns + clean_dtypes = self.dtype + else: + clean_dtypes = _clean_mapping(self.dtype) return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, - self.verbose, clean_conv) + self.verbose, clean_conv, + clean_dtypes) def _to_recarray(self, data, columns): dtypes = [] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 9cbe88d4032a3..c781b0549ee60 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,10 +12,9 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas import DataFrame from pandas import compat from pandas.compat import StringIO, range, lrange -from pandas.types.dtypes import CategoricalDtype class 
CParserTests(object): @@ -100,29 +99,13 @@ def test_dtype_and_names_error(self): self.read_csv(StringIO(data), sep=r'\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) - def test_passing_dtype(self): - # see gh-6607 + def test_unsupported_dtype(self): df = DataFrame(np.random.rand(5, 2), columns=list( 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + with tm.ensure_clean('__unsupported_dtype__.csv') as path: df.to_csv(path) - # see gh-3795: passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) - - # we expect all object columns, so need to - # convert to test for equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - self.assertRaises(TypeError, self.read_csv, path, - dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - # valid but we don't support it (date) self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, @@ -141,11 +124,6 @@ def test_passing_dtype(self): dtype={'A': 'U8'}, index_col=0) - # see gh-12048: empty frame - actual = self.read_csv(StringIO('A,B'), dtype=str) - expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) - tm.assert_frame_equal(actual, expected) - def test_precise_conversion(self): # see gh-8002 tm._skip_if_32bit() @@ -178,104 +156,6 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_pass_dtype(self): - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'object') - - def test_categorical_dtype(self): - # GH 10153 - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['a', 'a', 'b']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'a': 'category', - 'b': 'category', - 'c': CategoricalDtype()}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) - expected = pd.DataFrame({'a': [1, 1, 2], - 'b': Categorical(['a', 'a', 'b']), - 'c': [3.4, 3.4, 4.5]}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - # unsorted - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['b', 'b', 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - # missing - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['b', np.nan, 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_encoding(self): - # GH 10153 - pth = tm.get_data_path('unicode_series.csv') - encoding = 'latin-1' - expected = self.read_csv(pth, 
header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - actual = self.read_csv(pth, header=None, encoding=encoding, - dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - pth = tm.get_data_path('utf16_ex.txt') - encoding = 'utf-16' - expected = self.read_table(pth, encoding=encoding) - expected = expected.apply(Categorical) - actual = self.read_table(pth, encoding=encoding, dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_chunksize(self): - # GH 10153 - data = """a,b -1,a -1,b -1,b -2,c""" - expecteds = [pd.DataFrame({'a': [1, 1], - 'b': Categorical(['a', 'b'])}), - pd.DataFrame({'a': [1, 2], - 'b': Categorical(['b', 'c'])}, - index=[2, 3])] - actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, - chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( @@ -295,66 +175,6 @@ def test_pass_dtype_as_recarray(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'S1') - def test_empty_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) - - expected = DataFrame({'one': np.empty(0, dtype='u1'), - 'two': np.empty(0, dtype=np.object)}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_index_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), index_col=['one'], - dtype={'one': 'u1', 1: 'f'}) - - expected = DataFrame({'two': np.empty(0, dtype='f')}, - index=Index([], dtype='u1', name='one')) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_multiindex_pass_dtype(self): - data = 'one,two,three' - result = self.read_csv(StringIO(data), index_col=['one', 'two'], - dtype={'one': 'u1', 1: 'f8'}) - - exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), - np.empty(0, dtype='O')], - names=['one', 'two']) - expected = DataFrame( - {'three': np.empty(0, dtype=np.object)}, index=exp_idx) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={ - 'one': 'u1', 'one.1': 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_indexes(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # see gh-9424 - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one.1', dtype='f')], axis=1) - - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) - def test_usecols_dtypes(self): data = """\ 1,2,3 @@ -400,16 +220,6 @@ def test_custom_lineterminator(self): tm.assert_frame_equal(result, expected) - def 
test_raise_on_passed_int_dtype_with_nas(self): - # see gh-2631 - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - self.assertRaises(ValueError, self.read_csv, StringIO(data), - sep=",", skipinitialspace=True, - dtype={'DOY': np.int64}) - def test_parse_ragged_csv(self): data = """1,2,3 1,2,3,4 @@ -561,49 +371,3 @@ def test_internal_null_byte(self): result = self.read_csv(StringIO(data), names=names) tm.assert_frame_equal(result, expected) - - def test_empty_dtype(self): - # see gh-14712 - data = 'a,b' - - expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) - result = self.read_csv(StringIO(data), header=0, dtype=np.float64) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Categorical([]), - 'b': pd.Categorical([])}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='category') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') - result = self.read_csv(StringIO(data), header=0, - dtype='datetime64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), - 'b': pd.Series([], dtype='timedelta64[ns]')}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='timedelta64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.float64}) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={0: np.float64}) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.int32) - expected['b'] = expected['b'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.int32, 1: np.float64}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py new file mode 100644 index 0000000000000..18c37b31f6480 --- /dev/null +++ b/pandas/io/tests/parser/dtypes.py @@ -0,0 +1,274 @@ +# -*- coding: utf-8 -*- + +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" + +import numpy as np +import pandas as pd +import pandas.util.testing as tm + +from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas.compat import StringIO +from pandas.types.dtypes import CategoricalDtype +from pandas.io.common import ParserWarning + + +class DtypeTests(object): + def test_passing_dtype(self): + # see gh-6607 + df = DataFrame(np.random.rand(5, 2).round(4), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # see gh-3795: passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + expected = df.astype(str) + tm.assert_frame_equal(result, expected) + + # for parsing, interpret object as str + result = self.read_csv(path, dtype=object, index_col=0) + tm.assert_frame_equal(result, expected) + + # we expect all object columns, so need to + # convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result, df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'foo', 'B': 'float64'}, + index_col=0) 
+ + # see gh-12048: empty frame + actual = self.read_csv(StringIO('A,B'), dtype=str) + expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) + tm.assert_frame_equal(actual, expected) + + def test_pass_dtype(self): + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'object') + + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', 'b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', np.nan, 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + pth = tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + expected = expected.apply(Categorical) + actual = self.read_table(pth, encoding=encoding, dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'])}), + pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'])}, + index=[2, 3])] + actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + def test_empty_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) + + expected = DataFrame({'one': np.empty(0, dtype='u1'), + 'two': np.empty(0, dtype=np.object)}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_index_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), index_col=['one'], + dtype={'one': 'u1', 1: 'f'}) + + expected = DataFrame({'two': np.empty(0, 
dtype='f')}, + index=Index([], dtype='u1', name='one')) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_multiindex_pass_dtype(self): + data = 'one,two,three' + result = self.read_csv(StringIO(data), index_col=['one', 'two'], + dtype={'one': 'u1', 1: 'f8'}) + + exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), + np.empty(0, dtype='O')], + names=['one', 'two']) + expected = DataFrame( + {'three': np.empty(0, dtype=np.object)}, index=exp_idx) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_names(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 'one.1': 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_indexes(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_dup_column_pass_dtype_by_indexes(self): + # see gh-9424 + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one.1', dtype='f')], axis=1) + + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_raise_on_passed_int_dtype_with_nas(self): + # see gh-2631 + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + self.assertRaises(ValueError, self.read_csv, StringIO(data), + sep=",", skipinitialspace=True, + dtype={'DOY': np.int64}) + + def test_dtype_with_converter(self): + data = """a,b +1.1,2.2 +1.2,2.3""" + # dtype spec ignored if converted specified + with tm.assert_produces_warning(ParserWarning): + result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, + converters={'a': lambda x: str(x)}) + expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + def test_empty_dtype(self): + # see gh-14712 + data = 'a,b' + + expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) + result = self.read_csv(StringIO(data), header=0, dtype=np.float64) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Categorical([]), + 'b': pd.Categorical([])}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='category') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') + result = self.read_csv(StringIO(data), header=0, + dtype='datetime64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), + 'b': pd.Series([], dtype='timedelta64[ns]')}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='timedelta64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = 
expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={0: np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.int32) + expected['b'] = expected['b'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.int32, 1: np.float64}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 6001c85ae76b1..6cca2e35e1135 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -22,6 +22,7 @@ from .compression import CompressionTests from .multithread import MultithreadTests from .python_parser_only import PythonParserTests +from .dtypes import DtypeTests class BaseParser(CommentTests, CompressionTests, @@ -29,7 +30,8 @@ class BaseParser(CommentTests, CompressionTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests, SkipRowsTests, - UsecolsTests, QuotingTests): + UsecolsTests, QuotingTests, + DtypeTests): def read_csv(self, *args, **kwargs): raise NotImplementedError diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 5d60c20854a83..ffd1cfa9a2538 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -44,16 +44,6 @@ def test_c_engine(self): data = 'a b c\n1 2 3' msg = 'does not support' - # specify C-unsupported options with python-unsupported option - # (options will be ignored on fallback, raise) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=None, - delim_whitespace=False, dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=r'\s', dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), skipfooter=1, dtype={'a': float}) - # specify C engine with unsupported options (raise) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 6b43dfbabc4a0..6760e822960f1 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -13,7 +13,7 @@ from cpython cimport (PyObject, PyBytes_FromString, PyUnicode_Check, PyUnicode_AsUTF8String, PyErr_Occurred, PyErr_Fetch) from cpython.ref cimport PyObject, Py_XDECREF -from io.common import ParserError, DtypeWarning, EmptyDataError +from io.common import ParserError, DtypeWarning, EmptyDataError, ParserWarning # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. 
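The hunk below hoists per-column dtype resolution out of `_convert_tokens` so the C parser can warn when a column is given both a converter and a dtype. A minimal sketch of the resulting user-facing behaviour, mirroring the `test_dtype_with_converter` case added above (the sample data is illustrative):

    import warnings
    import pandas as pd
    from pandas.compat import StringIO

    data = 'a,b\n1.1,2.2\n1.2,2.3'
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        df = pd.read_csv(StringIO(data), dtype={'a': 'i8'},
                         converters={'a': str})

    # column 'a' holds the converter's strings ('1.1', '1.2'); the dtype request
    # is ignored, and `caught` contains a ParserWarning explaining that only the
    # converter will be used.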
@@ -987,7 +987,7 @@ cdef class TextReader: Py_ssize_t i, nused kh_str_t *na_hashset = NULL int start, end - object name, na_flist + object name, na_flist, col_dtype = None bint na_filter = 0 Py_ssize_t num_cols @@ -1043,14 +1043,34 @@ cdef class TextReader: else: na_filter = 0 + col_dtype = None + if self.dtype is not None: + if isinstance(self.dtype, dict): + if name in self.dtype: + col_dtype = self.dtype[name] + elif i in self.dtype: + col_dtype = self.dtype[i] + else: + if self.dtype.names: + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) + else: + col_dtype = self.dtype + if conv: + if col_dtype is not None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(name), ParserWarning, + stacklevel=5) results[i] = _apply_converter(conv, self.parser, i, start, end, self.c_encoding) continue # Should return as the desired dtype (inferred or specified) col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, na_flist) + i, start, end, name, na_filter, na_hashset, + na_flist, col_dtype) if na_filter: self._free_na_set(na_hashset) @@ -1075,32 +1095,17 @@ cdef class TextReader: cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, kh_str_t *na_hashset, - object na_flist): - cdef: - object col_dtype = None - - if self.dtype is not None: - if isinstance(self.dtype, dict): - if name in self.dtype: - col_dtype = self.dtype[name] - elif i in self.dtype: - col_dtype = self.dtype[i] - else: - if self.dtype.names: - # structured array - col_dtype = np.dtype(self.dtype.descr[i][1]) - else: - col_dtype = self.dtype + object na_flist, object col_dtype): - if col_dtype is not None: - col_res, na_count = self._convert_with_dtype( - col_dtype, i, start, end, na_filter, - 1, na_hashset, na_flist) + if col_dtype is not None: + col_res, na_count = self._convert_with_dtype( + col_dtype, i, start, end, na_filter, + 1, na_hashset, na_flist) - # Fallback on the parse (e.g. we requested int dtype, - # but its actually a float). - if col_res is not None: - return col_res, na_count + # Fallback on the parse (e.g. we requested int dtype, + # but its actually a float). + if col_res is not None: + return col_res, na_count if i in self.noconvert: return self._string_convert(i, start, end, na_filter, na_hashset) From 6d2b34af75ac28554cd6df933a7f502205ab7b35 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 26 Nov 2016 10:13:05 +0100 Subject: [PATCH 100/183] BUG: mixed freq timeseries plotting with shared axes (GH13341) (#14330) --- doc/source/whatsnew/v0.19.2.txt | 3 +- pandas/tests/plotting/test_datetimelike.py | 35 ++++++++++++++++++++++ pandas/tseries/plotting.py | 35 +++++++++++++++++----- 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index d9aa92270669d..74ee466d64a8e 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -39,7 +39,8 @@ Bug Fixes - Bug in ``pd.cut`` with negative values and a single bin (:issue:`14652`) - Bug in ``pd.to_numeric`` where a 0 was not unsigned on a ``downcast='unsigned'`` argument (:issue:`14401`) - +- Bug in plotting regular and irregular timeseries using shared axes + (``sharex=True`` or ``ax.twinx()``) (:issue:`13341`, :issue:`14322`). 
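The shared-axes case being fixed is easiest to see from the usage pattern itself; a minimal sketch, closely following the new `test_mixed_freq_shared_ax` test below (the data is illustrative):

    import pandas as pd
    import matplotlib.pyplot as plt

    idx1 = pd.date_range('2015-01-01', periods=3, freq='M')
    idx2 = idx1[:1].union(idx1[2:])         # drop the middle point -> irregular index
    s1 = pd.Series(range(len(idx1)), idx1)
    s2 = pd.Series(range(len(idx2)), idx2)

    # with sharex=True (or ax.twinx()), both axes should end up on the same
    # period-based x-coordinates; previously the irregular series was plotted
    # on a different scale
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
    s1.plot(ax=ax1)
    s2.plot(ax=ax2)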
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 0f7bc02e24915..f07aadba175f2 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -778,6 +778,41 @@ def test_mixed_freq_irreg_period(self): irreg.plot() ps.plot() + def test_mixed_freq_shared_ax(self): + + # GH13341, using sharex=True + idx1 = date_range('2015-01-01', periods=3, freq='M') + idx2 = idx1[:1].union(idx1[2:]) + s1 = Series(range(len(idx1)), idx1) + s2 = Series(range(len(idx2)), idx2) + + fig, (ax1, ax2) = self.plt.subplots(nrows=2, sharex=True) + s1.plot(ax=ax1) + s2.plot(ax=ax2) + + self.assertEqual(ax1.freq, 'M') + self.assertEqual(ax2.freq, 'M') + self.assertEqual(ax1.lines[0].get_xydata()[0, 0], + ax2.lines[0].get_xydata()[0, 0]) + + # using twinx + fig, ax1 = self.plt.subplots() + ax2 = ax1.twinx() + s1.plot(ax=ax1) + s2.plot(ax=ax2) + + self.assertEqual(ax1.lines[0].get_xydata()[0, 0], + ax2.lines[0].get_xydata()[0, 0]) + + # TODO (GH14330, GH14322) + # plotting the irregular first does not yet work + # fig, ax1 = plt.subplots() + # ax2 = ax1.twinx() + # s2.plot(ax=ax1) + # s1.plot(ax=ax2) + # self.assertEqual(ax1.lines[0].get_xydata()[0, 0], + # ax2.lines[0].get_xydata()[0, 0]) + @slow def test_to_weekly_resampling(self): idxh = date_range('1/1/1999', periods=52, freq='W') diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index fe64af67af0ed..89aecf2acc07e 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -162,18 +162,37 @@ def _decorate_axes(ax, freq, kwargs): ax.date_axis_info = None -def _get_freq(ax, series): - # get frequency from data - freq = getattr(series.index, 'freq', None) - if freq is None: - freq = getattr(series.index, 'inferred_freq', None) - +def _get_ax_freq(ax): + """ + Get the freq attribute of the ax object if set. 
+ Also checks shared axes (eg when using secondary yaxis, sharex=True + or twinx) + """ ax_freq = getattr(ax, 'freq', None) if ax_freq is None: + # check for left/right ax in case of secondary yaxis if hasattr(ax, 'left_ax'): ax_freq = getattr(ax.left_ax, 'freq', None) elif hasattr(ax, 'right_ax'): ax_freq = getattr(ax.right_ax, 'freq', None) + if ax_freq is None: + # check if a shared ax (sharex/twinx) has already freq set + shared_axes = ax.get_shared_x_axes().get_siblings(ax) + if len(shared_axes) > 1: + for shared_ax in shared_axes: + ax_freq = getattr(shared_ax, 'freq', None) + if ax_freq is not None: + break + return ax_freq + + +def _get_freq(ax, series): + # get frequency from data + freq = getattr(series.index, 'freq', None) + if freq is None: + freq = getattr(series.index, 'inferred_freq', None) + + ax_freq = _get_ax_freq(ax) # use axes freq if no data freq if freq is None: @@ -191,7 +210,7 @@ def _get_freq(ax, series): def _use_dynamic_x(ax, data): freq = _get_index_freq(data) - ax_freq = getattr(ax, 'freq', None) + ax_freq = _get_ax_freq(ax) if freq is None: # convert irregular if axes has freq info freq = ax_freq @@ -244,7 +263,7 @@ def _maybe_convert_index(ax, data): freq = freq.rule_code if freq is None: - freq = getattr(ax, 'freq', None) + freq = _get_ax_freq(ax) if freq is None: raise ValueError('Could not get frequency alias for plotting') From 08d7b2ca47fce41e9ab0929a64acc54c96ea5731 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 26 Nov 2016 04:18:22 -0500 Subject: [PATCH 101/183] Standardize function signatures (#14645) Standardize the following function signatures: 1) repeat(reps, *args, **kwargs) 2) searchsorted(value, side='left', sorter=None) Closes gh-12662. --- doc/source/whatsnew/v0.20.0.txt | 4 ++++ pandas/core/base.py | 15 ++++++++------- pandas/core/categorical.py | 7 ++++--- pandas/core/series.py | 16 +++++++++------- pandas/indexes/base.py | 7 ++++--- pandas/indexes/multi.py | 5 +++-- pandas/tests/indexes/test_base.py | 19 +++++++++++++++---- pandas/tests/indexes/test_multi.py | 4 ++++ pandas/tests/series/test_analytics.py | 17 +++++++++++++++++ pandas/tests/test_categorical.py | 5 +++++ pandas/tseries/index.py | 13 +++++++------ pandas/tseries/period.py | 22 ++++++++++++---------- pandas/tseries/tdi.py | 15 ++++++++------- pandas/tseries/tests/test_period.py | 3 +++ 14 files changed, 103 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6e3559bee728d..847583e1871f9 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -63,6 +63,10 @@ Other API Changes Deprecations ^^^^^^^^^^^^ +- ``Series.repeat()`` has deprecated the ``reps`` parameter in favor of ``repeats`` (:issue:`12662`) +- ``Index.repeat()`` and ``MultiIndex.repeat()`` have deprecated the ``n`` parameter in favor of ``repeats`` (:issue:`12662`) +- ``Categorical.searchsorted()`` and ``Series.searchsorted()`` have deprecated the ``v`` parameter in favor of ``value`` (:issue:`12662`) +- ``TimedeltaIndex.searchsorted()``, ``DatetimeIndex.searchsorted()``, and ``PeriodIndex.searchsorted()`` have deprecated the ``key`` parameter in favor of ``value`` (:issue:`12662`) diff --git a/pandas/core/base.py b/pandas/core/base.py index b9a70292498e4..d412349447794 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1091,12 +1091,12 @@ def factorize(self, sort=False, na_sentinel=-1): """Find indices where elements should be inserted to maintain order. 
Find the indices into a sorted %(klass)s `self` such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. Parameters ---------- - %(value)s : array_like + value : array_like Values to insert into `self`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. @@ -1109,7 +1109,7 @@ def factorize(self, sort=False, na_sentinel=-1): Returns ------- indices : array of ints - Array of insertion points with the same shape as `v`. + Array of insertion points with the same shape as `value`. See Also -------- @@ -1149,11 +1149,12 @@ def factorize(self, sort=False, na_sentinel=-1): array([3, 4]) # eggs before milk """) - @Substitution(klass='IndexOpsMixin', value='key') + @Substitution(klass='IndexOpsMixin') @Appender(_shared_docs['searchsorted']) - def searchsorted(self, key, side='left', sorter=None): + @deprecate_kwarg(old_arg_name='key', new_arg_name='value') + def searchsorted(self, value, side='left', sorter=None): # needs coercion on the key (DatetimeIndex does already) - return self.values.searchsorted(key, side=side, sorter=sorter) + return self.values.searchsorted(value, side=side, sorter=sorter) _shared_docs['drop_duplicates'] = ( """Return %(klass)s with duplicate values removed diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index fd1a23a5bab7f..922fb84684729 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1076,9 +1076,10 @@ def memory_usage(self, deep=False): """ return self._codes.nbytes + self._categories.memory_usage(deep=deep) - @Substitution(klass='Categorical', value='v') + @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) - def searchsorted(self, v, side='left', sorter=None): + @deprecate_kwarg(old_arg_name='v', new_arg_name='value') + def searchsorted(self, value, side='left', sorter=None): if not self.ordered: raise ValueError("Categorical not ordered\nyou can use " ".as_ordered() to change the Categorical to an " @@ -1086,7 +1087,7 @@ def searchsorted(self, v, side='left', sorter=None): from pandas.core.series import Series values_as_codes = self.categories.values.searchsorted( - Series(v).values, side=side) + Series(value).values, side=side) return self.codes.searchsorted(values_as_codes, sorter=sorter) diff --git a/pandas/core/series.py b/pandas/core/series.py index 44d1703fb9b8a..56a3933bded3b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -832,18 +832,19 @@ def _set_values(self, key, value): self._data = self._data.setitem(indexer=key, value=value) self._maybe_update_cacher() - def repeat(self, reps, *args, **kwargs): + @deprecate_kwarg(old_arg_name='reps', new_arg_name='repeats') + def repeat(self, repeats, *args, **kwargs): """ Repeat elements of an Series. Refer to `numpy.ndarray.repeat` - for more information about the `reps` argument. + for more information about the `repeats` argument. 
See also -------- numpy.ndarray.repeat """ nv.validate_repeat(args, kwargs) - new_index = self.index.repeat(reps) - new_values = self._values.repeat(reps) + new_index = self.index.repeat(repeats) + new_values = self._values.repeat(repeats) return self._constructor(new_values, index=new_index).__finalize__(self) @@ -1509,12 +1510,13 @@ def dot(self, other): else: # pragma: no cover raise TypeError('unsupported type: %s' % type(other)) - @Substitution(klass='Series', value='v') + @Substitution(klass='Series') @Appender(base._shared_docs['searchsorted']) - def searchsorted(self, v, side='left', sorter=None): + @deprecate_kwarg(old_arg_name='v', new_arg_name='value') + def searchsorted(self, value, side='left', sorter=None): if sorter is not None: sorter = _ensure_platform_int(sorter) - return self._values.searchsorted(Series(v)._values, + return self._values.searchsorted(Series(value)._values, side=side, sorter=sorter) # ------------------------------------------------------------------- diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 54eaf86315a88..512abfd88c78c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -535,17 +535,18 @@ def tolist(self): """ return list(self.values) - def repeat(self, n, *args, **kwargs): + @deprecate_kwarg(old_arg_name='n', new_arg_name='repeats') + def repeat(self, repeats, *args, **kwargs): """ Repeat elements of an Index. Refer to `numpy.ndarray.repeat` - for more information about the `n` argument. + for more information about the `repeats` argument. See also -------- numpy.ndarray.repeat """ nv.validate_repeat(args, kwargs) - return self._shallow_copy(self._values.repeat(n)) + return self._shallow_copy(self._values.repeat(repeats)) def where(self, cond, other=None): """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index f9576d92d8a49..45b6cad89d020 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1166,10 +1166,11 @@ def append(self, other): def argsort(self, *args, **kwargs): return self.values.argsort(*args, **kwargs) - def repeat(self, n, *args, **kwargs): + @deprecate_kwarg(old_arg_name='n', new_arg_name='repeats') + def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return MultiIndex(levels=self.levels, - labels=[label.view(np.ndarray).repeat(n) + labels=[label.view(np.ndarray).repeat(repeats) for label in self.labels], names=self.names, sortorder=self.sortorder, verify_integrity=False) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 329e85d82122e..628095a2fcbd3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -17,6 +17,7 @@ Float64Index, Int64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) +from pandas.core.index import _get_combined_index from pandas.util.testing import assert_almost_equal from pandas.compat.numpy import np_datetime64_compat @@ -1976,8 +1977,18 @@ def test_dropna(self): with tm.assertRaisesRegexp(ValueError, msg): pd.Index([1, 2, 3]).dropna(how='xxx') + def test_get_combined_index(self): + result = _get_combined_index([]) + tm.assert_index_equal(result, Index([])) -def test_get_combined_index(): - from pandas.core.index import _get_combined_index - result = _get_combined_index([]) - tm.assert_index_equal(result, Index([])) + def test_repeat(self): + repeats = 2 + idx = pd.Index([1, 2, 3]) + expected = pd.Index([1, 1, 2, 2, 3, 3]) + + result = idx.repeat(repeats) + tm.assert_index_equal(result, expected) + + with 
tm.assert_produces_warning(FutureWarning): + result = idx.repeat(n=repeats) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 61a4ea53f06fb..e1e714719092a 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -97,6 +97,10 @@ def test_repeat(self): numbers, names.repeat(reps)], names=names) tm.assert_index_equal(m.repeat(reps), expected) + with tm.assert_produces_warning(FutureWarning): + result = m.repeat(n=reps) + tm.assert_index_equal(result, expected) + def test_numpy_repeat(self): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6de1a68464436..ad74b4a7e5cda 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1363,6 +1363,10 @@ def test_repeat(self): exp = Series(s.values.repeat(5), index=s.index.values.repeat(5)) assert_series_equal(reps, exp) + with tm.assert_produces_warning(FutureWarning): + result = s.repeat(reps=5) + assert_series_equal(result, exp) + to_rep = [2, 3, 4] reps = s.repeat(to_rep) exp = Series(s.values.repeat(to_rep), @@ -1378,6 +1382,19 @@ def test_numpy_repeat(self): msg = "the 'axis' parameter is not supported" tm.assertRaisesRegexp(ValueError, msg, np.repeat, s, 2, axis=0) + def test_searchsorted(self): + s = Series([1, 2, 3]) + + idx = s.searchsorted(1, side='left') + tm.assert_numpy_array_equal(idx, np.array([0], dtype=np.intp)) + + idx = s.searchsorted(1, side='right') + tm.assert_numpy_array_equal(idx, np.array([1], dtype=np.intp)) + + with tm.assert_produces_warning(FutureWarning): + idx = s.searchsorted(v=1, side='left') + tm.assert_numpy_array_equal(idx, np.array([0], dtype=np.intp)) + def test_searchsorted_numeric_dtypes_scalar(self): s = Series([1, 2, 90, 1000, 3e9]) r = s.searchsorted(30) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index f01fff035a3c5..5320b2216ee40 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1593,6 +1593,11 @@ def test_searchsorted(self): self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) + with tm.assert_produces_warning(FutureWarning): + res = c1.searchsorted(v=['bread']) + exp = np.array([1], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + def test_deprecated_labels(self): # TODO: labels is deprecated and should be removed in 0.18 or 2017, # whatever is earlier diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 024306edef2d8..0824072cc383f 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1620,15 +1620,16 @@ def normalize(self): return DatetimeIndex(new_values, freq='infer', name=self.name, tz=self.tz) - @Substitution(klass='DatetimeIndex', value='key') + @Substitution(klass='DatetimeIndex') @Appender(_shared_docs['searchsorted']) - def searchsorted(self, key, side='left', sorter=None): - if isinstance(key, (np.ndarray, Index)): - key = np.array(key, dtype=_NS_DTYPE, copy=False) + @deprecate_kwarg(old_arg_name='key', new_arg_name='value') + def searchsorted(self, value, side='left', sorter=None): + if isinstance(value, (np.ndarray, Index)): + value = np.array(value, dtype=_NS_DTYPE, copy=False) else: - key = _to_m8(key, tz=self.tz) + value = _to_m8(value, tz=self.tz) - return self.values.searchsorted(key, side=side) + return self.values.searchsorted(value, side=side) def is_type_compatible(self, typ): return typ == self.inferred_type or 
typ == 'datetime' diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index d5d89c8dc2614..4bab3bc14461e 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -39,7 +39,8 @@ from pandas.indexes.base import _index_shared_docs, _ensure_index from pandas import compat -from pandas.util.decorators import Appender, cache_readonly, Substitution +from pandas.util.decorators import (Appender, Substitution, cache_readonly, + deprecate_kwarg) from pandas.lib import infer_dtype import pandas.tslib as tslib from pandas.compat import zip, u @@ -460,18 +461,19 @@ def astype(self, dtype, copy=True, how='start'): return self.asfreq(freq=dtype.freq) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) - @Substitution(klass='PeriodIndex', value='key') + @Substitution(klass='PeriodIndex') @Appender(_shared_docs['searchsorted']) - def searchsorted(self, key, side='left', sorter=None): - if isinstance(key, Period): - if key.freq != self.freq: - msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, key.freqstr) + @deprecate_kwarg(old_arg_name='key', new_arg_name='value') + def searchsorted(self, value, side='left', sorter=None): + if isinstance(value, Period): + if value.freq != self.freq: + msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, value.freqstr) raise IncompatibleFrequency(msg) - key = key.ordinal - elif isinstance(key, compat.string_types): - key = Period(key, freq=self.freq).ordinal + value = value.ordinal + elif isinstance(value, compat.string_types): + value = Period(value, freq=self.freq).ordinal - return self._values.searchsorted(key, side=side, sorter=sorter) + return self._values.searchsorted(value, side=side, sorter=sorter) @property def is_all_dates(self): diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index c1b0936edaff9..7e77d8baf3b2c 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -25,7 +25,7 @@ from pandas.indexes.base import _index_shared_docs import pandas.core.common as com import pandas.types.concat as _concat -from pandas.util.decorators import Appender, Substitution +from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.tseries.base import TimelikeOps, DatetimeIndexOpsMixin from pandas.tseries.timedeltas import (to_timedelta, _coerce_scalar_to_timedelta_type) @@ -785,15 +785,16 @@ def _partial_td_slice(self, key, freq, use_lhs=True, use_rhs=True): # # try to find a the dates # return (lhs_mask & rhs_mask).nonzero()[0] - @Substitution(klass='TimedeltaIndex', value='key') + @Substitution(klass='TimedeltaIndex') @Appender(_shared_docs['searchsorted']) - def searchsorted(self, key, side='left', sorter=None): - if isinstance(key, (np.ndarray, Index)): - key = np.array(key, dtype=_TD_DTYPE, copy=False) + @deprecate_kwarg(old_arg_name='key', new_arg_name='value') + def searchsorted(self, value, side='left', sorter=None): + if isinstance(value, (np.ndarray, Index)): + value = np.array(value, dtype=_TD_DTYPE, copy=False) else: - key = _to_m8(key) + value = _to_m8(value) - return self.values.searchsorted(key, side=side, sorter=sorter) + return self.values.searchsorted(value, side=side, sorter=sorter) def is_type_compatible(self, typ): return typ == self.inferred_type or typ == 'timedelta' diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 9bdf420ca6084..fe0d28dd9c508 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -3698,6 +3698,9 @@ def test_searchsorted(self): with 
self.assertRaisesRegexp(period.IncompatibleFrequency, msg): pidx.searchsorted(pd.Period('2014-01-01', freq='5D')) + with tm.assert_produces_warning(FutureWarning): + pidx.searchsorted(key=p2) + def test_round_trip(self): p = Period('2000Q1') From 837db725b147104be4b0e5cbfd49868971ee4f78 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 26 Nov 2016 18:18:54 +0100 Subject: [PATCH 102/183] Revert "TST/TEMP: fix pyqt to 4.x for plotting tests" (#14744) --- ci/requirements-2.7-64.run | 1 - ci/requirements-2.7.run | 1 - ci/requirements-3.5-64.run | 1 - ci/requirements-3.5.run | 1 - 4 files changed, 4 deletions(-) diff --git a/ci/requirements-2.7-64.run b/ci/requirements-2.7-64.run index ce085a6ebf91c..42b5a789ae31a 100644 --- a/ci/requirements-2.7-64.run +++ b/ci/requirements-2.7-64.run @@ -16,4 +16,3 @@ bottleneck html5lib beautiful-soup jinja2=2.8 -pyqt=4.11.4 diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index eec7886fed38d..560d6571b8771 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -21,4 +21,3 @@ beautiful-soup=4.2.1 statsmodels jinja2=2.8 xarray -pyqt=4.11.4 diff --git a/ci/requirements-3.5-64.run b/ci/requirements-3.5-64.run index 1dc88ed2c94af..96de21e3daa5e 100644 --- a/ci/requirements-3.5-64.run +++ b/ci/requirements-3.5-64.run @@ -10,4 +10,3 @@ numexpr pytables matplotlib blosc -pyqt=4.11.4 diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index d9ce708585a33..333641caf26c4 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -18,7 +18,6 @@ pymysql psycopg2 xarray boto -pyqt=4.11.4 # incompat with conda ATM # beautiful-soup From c5f219acfc0d82449e17ad6cecbac8b9bfa9f4e5 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 28 Nov 2016 04:48:59 -0500 Subject: [PATCH 103/183] BUG: SAS chunksize / iteration issues (#14743) closes #14734 closes #13654 --- doc/source/whatsnew/v0.19.2.txt | 1 + doc/source/whatsnew/v0.20.0.txt | 1 - pandas/io/sas/sas7bdat.py | 10 +++++++++ pandas/io/tests/sas/test_sas7bdat.py | 33 +++++++++++++++++++++++----- pandas/io/tests/sas/test_xport.py | 15 +++++++++++++ 5 files changed, 53 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 74ee466d64a8e..d2394ff25ddd4 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -31,6 +31,7 @@ Bug Fixes - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`) - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`) +- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally. 
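For reference, the incremental-read patterns these fixes make reliable look roughly like this (the file name is illustrative):

    import pandas as pd

    # chunked iteration now stops at end-of-file instead of looping past it
    reader = pd.read_sas('data.sas7bdat', chunksize=10, encoding='utf-8')
    nrows = 0
    for chunk in reader:
        nrows += chunk.shape[0]
    reader.close()

    # read() beyond the end of the file is clipped to the remaining rows
    reader = pd.read_sas('data.sas7bdat', iterator=True, encoding='utf-8')
    df = reader.read(reader.row_count + 100)
    reader.close()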
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 847583e1871f9..ff086380fdb05 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -96,4 +96,3 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 2a82fd7a53222..91f417abc0502 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -225,6 +225,12 @@ def _get_properties(self): self.os_name = self.os_name.decode( self.encoding or self.default_encoding) + def __next__(self): + da = self.read(nrows=self.chunksize or 1) + if da is None: + raise StopIteration + return da + # Read a single float of the given width (4 or 8). def _read_float(self, offset, width): if width not in (4, 8): @@ -591,6 +597,10 @@ def read(self, nrows=None): if self._current_row_in_file_index >= self.row_count: return None + m = self.row_count - self._current_row_in_file_index + if nrows > m: + nrows = m + nd = (self.column_types == b'd').sum() ns = (self.column_types == b's').sum() diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index 06eb9774679b1..e20ea48247119 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -47,7 +47,9 @@ def test_from_buffer(self): with open(fname, 'rb') as f: byts = f.read() buf = io.BytesIO(byts) - df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8') + rdr = pd.read_sas(buf, format="sas7bdat", + iterator=True, encoding='utf-8') + df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) def test_from_iterator(self): @@ -55,16 +57,35 @@ def test_from_iterator(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - with open(fname, 'rb') as f: - byts = f.read() - buf = io.BytesIO(byts) - rdr = pd.read_sas(buf, format="sas7bdat", - iterator=True, encoding='utf-8') + rdr = pd.read_sas(fname, iterator=True, encoding='utf-8') df = rdr.read(2) tm.assert_frame_equal(df, df0.iloc[0:2, :]) df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) + def test_iterator_loop(self): + # github #13654 + for j in 0, 1: + for k in self.test_ix[j]: + for chunksize in 3, 5, 10, 11: + fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) + rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8') + y = 0 + for x in rdr: + y += x.shape[0] + self.assertTrue(y == rdr.row_count) + + def test_iterator_read_too_much(self): + # github #14734 + k = self.test_ix[0][0] + fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) + rdr = pd.read_sas(fname, format="sas7bdat", + iterator=True, encoding='utf-8') + d1 = rdr.read(rdr.row_count + 20) + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") + d2 = rdr.read(rdr.row_count + 20) + tm.assert_frame_equal(d1, d2) + def test_encoding_options(): dirpath = tm.get_data_path() diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/io/tests/sas/test_xport.py index d0627a80f9604..fe2f7cb4bf4be 100644 --- a/pandas/io/tests/sas/test_xport.py +++ b/pandas/io/tests/sas/test_xport.py @@ -35,6 +35,13 @@ def test1_basic(self): # Read full file data = read_sas(self.file01, format="xport") tm.assert_frame_equal(data, data_csv) + num_rows = data.shape[0] + + # Test reading beyond end of file + reader = read_sas(self.file01, format="xport", iterator=True) + data = reader.read(num_rows + 100) + self.assertTrue(data.shape[0] == num_rows) + reader.close() # Test incremental read with `read` method. 
reader = read_sas(self.file01, format="xport", iterator=True) @@ -48,6 +55,14 @@ def test1_basic(self): reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) + # Test read in loop + m = 0 + reader = read_sas(self.file01, format="xport", chunksize=100) + for x in reader: + m += x.shape[0] + reader.close() + self.assertTrue(m == num_rows) + # Read full file with `read_sas` method data = read_sas(self.file01) tm.assert_frame_equal(data, data_csv) From 06f26b51e97a0e81e8bd7fca4bba18e57659d963 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 28 Nov 2016 11:19:05 -0500 Subject: [PATCH 104/183] ENH: add data hashing routines (#14729) xref https://github.com/dask/dask/pull/1807 --- asv_bench/benchmarks/algorithms.py | 33 ++++++ pandas/src/hash.pyx | 180 +++++++++++++++++++++++++++++ pandas/tools/hashing.py | 137 ++++++++++++++++++++++ pandas/tools/tests/test_hashing.py | 143 +++++++++++++++++++++++ setup.py | 7 +- 5 files changed, 498 insertions(+), 2 deletions(-) create mode 100644 pandas/src/hash.pyx create mode 100644 pandas/tools/hashing.py create mode 100644 pandas/tools/tests/test_hashing.py diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 9807639143ddb..53b7d55368f6a 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +from pandas.util import testing as tm class algorithm(object): @@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self): def time_add_overflow_mixed_arr(self): self.checked_add(self.arr, self.arrmixed) + + +class hashing(object): + goal_time = 0.2 + + def setup(self): + N = 100000 + + self.df = pd.DataFrame( + {'A': pd.Series(tm.makeStringIndex(100).take( + np.random.randint(0, 100, size=N))), + 'B': pd.Series(tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=N))), + 'D': np.random.randn(N), + 'E': np.arange(N), + 'F': pd.date_range('20110101', freq='s', periods=N), + 'G': pd.timedelta_range('1 day', freq='s', periods=N), + }) + self.df['C'] = self.df['B'].astype('category') + self.df.iloc[10:20] = np.nan + + def time_frame(self): + self.df.hash() + + def time_series_int(self): + self.df.E.hash() + + def time_series_string(self): + self.df.B.hash() + + def time_series_categorical(self): + self.df.C.hash() diff --git a/pandas/src/hash.pyx b/pandas/src/hash.pyx new file mode 100644 index 0000000000000..b8c309f1f7a13 --- /dev/null +++ b/pandas/src/hash.pyx @@ -0,0 +1,180 @@ +# cython: profile=False +# Translated from the reference implementation +# at https://github.com/veorq/SipHash + +import cython +cimport numpy as cnp +import numpy as np +from numpy cimport ndarray, uint8_t, uint32_t, uint64_t + +from cpython cimport (PyString_Check, + PyBytes_Check, + PyUnicode_Check) +from libc.stdlib cimport malloc, free + +DEF cROUNDS = 2 +DEF dROUNDS = 4 + + +@cython.boundscheck(False) +def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): + """ + Parameters + ---------- + arr : 1-d object ndarray of objects + key : hash key, must be 16 byte len encoded + encoding : encoding for key & arr, default to 'utf8' + + Returns + ------- + 1-d uint64 ndarray of hashes + + """ + cdef: + Py_ssize_t i, l, n + ndarray[uint64_t] result + bytes data, k + uint8_t *kb, *lens + char **vecs, *cdata + object val + + k = key.encode(encoding) + kb = k + if len(k) != 16: + raise ValueError( + 'key should be a 16-byte string encoded, got {!r} (len {})'.format( + k, len(k))) + + n = 
len(arr) + + # create an array of bytes + vecs = malloc(n * sizeof(char *)) + lens = malloc(n * sizeof(uint8_t)) + + cdef list datas = [] + for i in range(n): + val = arr[i] + if PyString_Check(val): + data = val.encode(encoding) + elif PyBytes_Check(val): + data = val + elif PyUnicode_Check(val): + data = val.encode(encoding) + else: + # non-strings + data = str(val).encode(encoding) + + l = len(data) + lens[i] = l + cdata = data + + # keep the refernce alive thru the end of the + # function + datas.append(data) + vecs[i] = cdata + + result = np.empty(n, dtype=np.uint64) + with nogil: + for i in range(n): + result[i] = low_level_siphash(vecs[i], lens[i], kb) + + free(vecs) + free(lens) + return result + +cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil: + return (x << b) | (x >> (64 - b)) + +cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil: + p[0] = (v) + p[1] = (v >> 8) + p[2] = (v >> 16) + p[3] = (v >> 24) + +cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil: + u32to8_le(p, v) + u32to8_le(p + 4, (v >> 32)) + +cdef inline uint64_t u8to64_le(uint8_t* p) nogil: + return (p[0] | + p[1] << 8 | + p[2] << 16 | + p[3] << 24 | + p[4] << 32 | + p[5] << 40 | + p[6] << 48 | + p[7] << 56) + +cdef inline void _sipround(uint64_t* v0, uint64_t* v1, + uint64_t* v2, uint64_t* v3) nogil: + v0[0] += v1[0] + v1[0] = _rotl(v1[0], 13) + v1[0] ^= v0[0] + v0[0] = _rotl(v0[0], 32) + v2[0] += v3[0] + v3[0] = _rotl(v3[0], 16) + v3[0] ^= v2[0] + v0[0] += v3[0] + v3[0] = _rotl(v3[0], 21) + v3[0] ^= v0[0] + v2[0] += v1[0] + v1[0] = _rotl(v1[0], 17) + v1[0] ^= v2[0] + v2[0] = _rotl(v2[0], 32) + +cpdef uint64_t siphash(bytes data, bytes key) except? 0: + if len(key) != 16: + raise ValueError( + 'key should be a 16-byte bytestring, got {!r} (len {})'.format( + key, len(key))) + return low_level_siphash(data, len(data), key) + + +@cython.cdivision(True) +cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen, + uint8_t* key) nogil: + cdef uint64_t v0 = 0x736f6d6570736575ULL + cdef uint64_t v1 = 0x646f72616e646f6dULL + cdef uint64_t v2 = 0x6c7967656e657261ULL + cdef uint64_t v3 = 0x7465646279746573ULL + cdef uint64_t b + cdef uint64_t k0 = u8to64_le(key) + cdef uint64_t k1 = u8to64_le(key + 8) + cdef uint64_t m + cdef int i + cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t)) + cdef int left = datalen & 7 + cdef int left_byte + + b = (datalen) << 56 + v3 ^= k1 + v2 ^= k0 + v1 ^= k1 + v0 ^= k0 + + while (data != end): + m = u8to64_le(data) + v3 ^= m + for i in range(cROUNDS): + _sipround(&v0, &v1, &v2, &v3) + v0 ^= m + + data += sizeof(uint64_t) + + for i in range(left-1, -1, -1): + b |= (data[i]) << (i * 8) + + v3 ^= b + + for i in range(cROUNDS): + _sipround(&v0, &v1, &v2, &v3) + + v0 ^= b + v2 ^= 0xff + + for i in range(dROUNDS): + _sipround(&v0, &v1, &v2, &v3) + + b = v0 ^ v1 ^ v2 ^ v3 + + return b diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py new file mode 100644 index 0000000000000..aa18b8bc70c37 --- /dev/null +++ b/pandas/tools/hashing.py @@ -0,0 +1,137 @@ +""" +data hash pandas / numpy objects +""" + +import numpy as np +from pandas import _hash, Series, factorize, Categorical, Index +from pandas.lib import infer_dtype +from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame +from pandas.types.common import is_categorical_dtype + +# 16 byte long hashing key +_default_hash_key = '0123456789123456' + + +def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None): + """ + Return a data hash of the Index/Series/DataFrame + + .. 
versionadded:: 0.19.2 + + Parameters + ---------- + index : boolean, default True + include the index in the hash (if Series/DataFrame) + encoding : string, default 'utf8' + encoding for data & key when strings + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + Series of uint64, same length as the object + + """ + if hash_key is None: + hash_key = _default_hash_key + + def adder(h, hashed_to_add): + h = np.multiply(h, np.uint(3), h) + return np.add(h, hashed_to_add, h) + + if isinstance(obj, ABCIndexClass): + h = hash_array(obj.values, encoding, hash_key).astype('uint64') + h = Series(h, index=obj, dtype='uint64') + elif isinstance(obj, ABCSeries): + h = hash_array(obj.values, encoding, hash_key).astype('uint64') + if index: + h = adder(h, hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key).values) + h = Series(h, index=obj.index, dtype='uint64') + elif isinstance(obj, ABCDataFrame): + cols = obj.iteritems() + first_series = next(cols)[1] + h = hash_array(first_series.values, encoding, + hash_key).astype('uint64') + for _, col in cols: + h = adder(h, hash_array(col.values, encoding, hash_key)) + if index: + h = adder(h, hash_pandas_object(obj.index, + index=False, + encoding=encoding, + hash_key=hash_key).values) + + h = Series(h, index=obj.index, dtype='uint64') + else: + raise TypeError("Unexpected type for hashing %s" % type(obj)) + return h + + +def hash_array(vals, encoding='utf8', hash_key=None): + """ + Given a 1d array, return an array of deterministic integers. + + .. versionadded:: 0.19.2 + + Parameters + ---------- + vals : ndarray + encoding : string, default 'utf8' + encoding for data & key when strings + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + 1d uint64 numpy array of hash values, same length as the vals + + """ + + # work with cagegoricals as ints. (This check is above the complex + # check so that we don't ask numpy if categorical is a subdtype of + # complex, as it will choke. + if hash_key is None: + hash_key = _default_hash_key + + if is_categorical_dtype(vals.dtype): + vals = vals.codes + + # we'll be working with everything as 64-bit values, so handle this + # 128-bit value early + if np.issubdtype(vals.dtype, np.complex128): + return hash_array(vals.real) + 23 * hash_array(vals.imag) + + # MAIN LOGIC: + inferred = infer_dtype(vals) + + # First, turn whatever array this is into unsigned 64-bit ints, if we can + # manage it. 
+ if inferred == 'boolean': + vals = vals.astype('u8') + + if (np.issubdtype(vals.dtype, np.datetime64) or + np.issubdtype(vals.dtype, np.timedelta64) or + np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8: + + vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') + else: + + # its MUCH faster to categorize object dtypes, then hash and rename + codes, categories = factorize(vals, sort=False) + categories = Index(categories) + c = Series(Categorical(codes, categories, + ordered=False, fastpath=True)) + vals = _hash.hash_object_array(categories.values, + hash_key, + encoding) + + # rename & extract + vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values + + # Then, redistribute these 64-bit ints within the space of 64-bit ints + vals ^= vals >> 30 + vals *= np.uint64(0xbf58476d1ce4e5b9) + vals ^= vals >> 27 + vals *= np.uint64(0x94d049bb133111eb) + vals ^= vals >> 31 + return vals diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py new file mode 100644 index 0000000000000..3e4c77244d2f7 --- /dev/null +++ b/pandas/tools/tests/test_hashing.py @@ -0,0 +1,143 @@ +import numpy as np +import pandas as pd + +from pandas import DataFrame, Series, Index +from pandas.tools.hashing import hash_array, hash_pandas_object +import pandas.util.testing as tm + + +class TestHashing(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.df = DataFrame( + {'i32': np.array([1, 2, 3] * 3, dtype='int32'), + 'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'), + 'cat': Series(['a', 'b', 'c'] * 3).astype('category'), + 'obj': Series(['d', 'e', 'f'] * 3), + 'bool': np.array([True, False, True] * 3), + 'dt': Series(pd.date_range('20130101', periods=9)), + 'dt_tz': Series(pd.date_range('20130101', periods=9, + tz='US/Eastern')), + 'td': Series(pd.timedelta_range('2000', periods=9))}) + + def test_consistency(self): + # check that our hash doesn't change because of a mistake + # in the actual code; this is the ground truth + result = hash_pandas_object(Index(['foo', 'bar', 'baz'])) + expected = Series(np.array([3600424527151052760, 1374399572096150070, + 477881037637427054], dtype='uint64'), + index=['foo', 'bar', 'baz']) + tm.assert_series_equal(result, expected) + + def test_hash_array(self): + for name, s in self.df.iteritems(): + a = s.values + tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) + + def check_equal(self, obj, **kwargs): + a = hash_pandas_object(obj, **kwargs) + b = hash_pandas_object(obj, **kwargs) + tm.assert_series_equal(a, b) + + kwargs.pop('index', None) + a = hash_pandas_object(obj, **kwargs) + b = hash_pandas_object(obj, **kwargs) + tm.assert_series_equal(a, b) + + def check_not_equal_with_index(self, obj): + + # check that we are not hashing the same if + # we include the index + if not isinstance(obj, Index): + a = hash_pandas_object(obj, index=True) + b = hash_pandas_object(obj, index=False) + self.assertFalse((a == b).all()) + + def test_hash_pandas_object(self): + + for obj in [Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(['a', 'b', 'c']), + Series(['a', np.nan, 'c']), + Series([True, False, True]), + Index([1, 2, 3]), + Index([True, False, True]), + DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + tm.makeTimedeltaIndex(), + Series([1, 2, 3], index=pd.MultiIndex.from_tuples( + [('a', 1), 
('a', 2), ('b', 1)]))]: + self.check_equal(obj) + self.check_not_equal_with_index(obj) + + def test_hash_pandas_object2(self): + for name, s in self.df.iteritems(): + self.check_equal(s) + self.check_not_equal_with_index(s) + + def test_hash_pandas_empty_object(self): + for obj in [Series([], dtype='float64'), + Series([], dtype='object'), + Index([])]: + self.check_equal(obj) + + # these are by-definition the same with + # or w/o the index as the data is empty + + def test_errors(self): + + for obj in [pd.Timestamp('20130101'), tm.makePanel()]: + def f(): + hash_pandas_object(f) + + self.assertRaises(TypeError, f) + + def test_hash_keys(self): + # using different hash keys, should have different hashes + # for the same data + + # this only matters for object dtypes + obj = Series(list('abc')) + a = hash_pandas_object(obj, hash_key='9876543210123456') + b = hash_pandas_object(obj, hash_key='9876543210123465') + self.assertTrue((a != b).all()) + + def test_invalid_key(self): + # this only matters for object dtypes + def f(): + hash_pandas_object(Series(list('abc')), hash_key='foo') + self.assertRaises(ValueError, f) + + def test_mixed(self): + # mixed objects + obj = Series(['1', 2, 3]) + self.check_equal(obj) + self.check_not_equal_with_index(obj) + + # mixed are actually equal when stringified + a = hash_pandas_object(obj) + b = hash_pandas_object(Series(list('123'))) + self.assert_series_equal(a, b) + + def test_alread_encoded(self): + # if already encoded then ok + + obj = Series(list('abc')).str.encode('utf8') + self.check_equal(obj) + + def test_alternate_encoding(self): + + obj = Series(list('abc')) + self.check_equal(obj, encoding='ascii') + + def test_long_strings(self): + + obj = Index(tm.rands_array(nchars=10000, size=100)) + self.check_equal(obj) diff --git a/setup.py b/setup.py index 2dd3fec150781..8d2e2669852ea 100755 --- a/setup.py +++ b/setup.py @@ -331,6 +331,7 @@ class CheckSDist(sdist_class): 'pandas/src/period.pyx', 'pandas/src/sparse.pyx', 'pandas/src/testing.pyx', + 'pandas/src/hash.pyx', 'pandas/io/sas/saslib.pyx'] def initialize_options(self): @@ -501,10 +502,12 @@ def pxd(name): 'sources': ['pandas/src/parser/tokenizer.c', 'pandas/src/parser/io.c']}, _sparse={'pyxfile': 'src/sparse', - 'depends': ([srcpath('sparse', suffix='.pyx')] - + _pxi_dep['_sparse'])}, + 'depends': ([srcpath('sparse', suffix='.pyx')] + + _pxi_dep['_sparse'])}, _testing={'pyxfile': 'src/testing', 'depends': [srcpath('testing', suffix='.pyx')]}, + _hash={'pyxfile': 'src/hash', + 'depends': [srcpath('hash', suffix='.pyx')]}, ) ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'} From 2f43ac4c4cdea407b42530bcfef490a18892302a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 28 Nov 2016 12:30:16 -0500 Subject: [PATCH 105/183] BF: (re)raise the exception always unless returning (#14756) otherwise leads atm to masking of this error while testing on i386 and then failling since UnboundLocalError: local variable unser referenced before assignment More detail: https://buildd.debian.org/status/fetch.php?pkg=pandas&arch=i386&ver=0.19.1-1&stamp=1479504883 --- pandas/io/tests/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index ba02e9186f1df..e6e6f33669e17 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -167,7 +167,7 @@ def _check_orient(df, orient, dtype=None, numpy=False, if raise_ok is not None: if isinstance(detail, raise_ok): 
return - raise + raise if sort is not None and sort in unser.columns: unser = unser.sort_values(sort) From dfeae396c832358e0a2f64e67bd48ca83a9c4976 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 29 Nov 2016 16:18:28 -0500 Subject: [PATCH 106/183] BUG: Improve error message for skipfooter malformed rows in Python engine (#14749) Python's native CSV library does not respect the skipfooter parameter, so if one of those skipped rows is malformed, it will still raise an error. Closes gh-13879. --- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/io/parsers.py | 23 ++++++++++++++------ pandas/io/tests/parser/python_parser_only.py | 15 +++++++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index d2394ff25ddd4..6ee6271929008 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -32,6 +32,7 @@ Bug Fixes - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`) - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`) - Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally. +- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when ``skipfooter`` was not being respected by Python's CSV library (:issue:`13879`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 94eb015701004..580a3398bb66a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2411,14 +2411,23 @@ def _next_line(self): try: orig_line = next(self.data) except csv.Error as e: + msg = str(e) + if 'NULL byte' in str(e): - raise csv.Error( - 'NULL byte detected. This byte ' - 'cannot be processed in Python\'s ' - 'native csv library at the moment, ' - 'so please pass in engine=\'c\' instead.') - else: - raise + msg = ('NULL byte detected. This byte ' + 'cannot be processed in Python\'s ' + 'native csv library at the moment, ' + 'so please pass in engine=\'c\' instead') + + if self.skipfooter > 0: + reason = ('Error could possibly be due to ' + 'parsing errors in the skipped footer rows ' + '(the skipfooter keyword is only applied ' + 'after Python\'s csv library has parsed ' + 'all rows).') + msg += '. ' + reason + + raise csv.Error(msg) line = self._check_comments([orig_line])[0] self.pos += 1 if (not self.skip_blank_lines and diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 55801b4a9788e..ad62aaa275127 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -221,3 +221,18 @@ def test_multi_char_sep_quotes(self): with tm.assertRaisesRegexp(ValueError, msg): self.read_csv(StringIO(data), sep=',,', quoting=csv.QUOTE_NONE) + + def test_skipfooter_bad_row(self): + # see gh-13879 + + data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz' + msg = 'parsing errors in the skipped footer rows' + + with tm.assertRaisesRegexp(csv.Error, msg): + self.read_csv(StringIO(data), skipfooter=1) + + # We expect no match, so there should be an assertion + # error out of the inner context manager. 
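
[Editorial sketch of the user-facing situation this patch improves: the unterminated quote sits in the footer row that skipfooter is meant to drop, but Python's csv module still trips over it; the error message now points at the skipped footer rows as the likely cause. Data and handling below are illustrative only.]

    import pandas as pd
    from pandas.compat import StringIO

    data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'
    try:
        pd.read_csv(StringIO(data), skipfooter=1)   # skipfooter implies the Python engine
    except Exception as err:
        print(err)   # "... parsing errors in the skipped footer rows ..."
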
+ with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(csv.Error, msg): + self.read_csv(StringIO(data)) From e3de0526644203ff667d533fdc82c617363f016e Mon Sep 17 00:00:00 2001 From: Tara Adiseshan Date: Tue, 29 Nov 2016 18:33:47 -0500 Subject: [PATCH 107/183] added read_msgpack() to index (#14765) --- doc/source/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/api.rst b/doc/source/api.rst index a510f663d19ee..638abd5421862 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -27,6 +27,7 @@ Flat File read_table read_csv read_fwf + read_msgpack Clipboard ~~~~~~~~~ From 43c24e621d054412ec21b5ec599c5e7f4281f81b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 29 Nov 2016 18:37:29 -0500 Subject: [PATCH 108/183] BLD: clean .pxi when cleaning (#14766) --- setup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/setup.py b/setup.py index 8d2e2669852ea..2bef65c9719dc 100755 --- a/setup.py +++ b/setup.py @@ -293,6 +293,11 @@ def initialize_options(self): if d == '__pycache__': self._clean_trees.append(pjoin(root, d)) + # clean the generated pxi files + for pxifile in _pxifiles: + pxifile = pxifile.replace(".pxi.in", ".pxi") + self._clean_me.append(pxifile) + for d in ('build', 'dist'): if os.path.exists(d): self._clean_trees.append(d) From 423c16a2ee88d82202c1e6b24a31d47ec6a04b82 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 29 Nov 2016 19:20:16 -0500 Subject: [PATCH 109/183] BLD/DOC: use new secure key for pandas-docs-travis --- .travis.yml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 49765c9df96ea..8a8e954429eb0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,13 +14,10 @@ cache: env: global: - # scatterci API key - #- secure: "Bx5umgo6WjuGY+5XFa004xjCiX/vq0CyMZ/ETzcs7EIBI1BE/0fIDXOoWhoxbY9HPfdPGlDnDgB9nGqr5wArO2s+BavyKBWg6osZ3dmkfuJPMOWeyCa92EeP+sfKw8e5HSU5MizW9e319wHWOF/xkzdHR7T67Qd5erhv91x4DnQ=" - # ironcache API key - #- secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA=" - #- secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw=" - # pandas-docs-bot GH - - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" + + # pandas-docs-travis GH + - secure: "B3m5+dbk7uPz3lpx8BWDbyEL27M5NcmKzAmCwDRuFRsUI0PaAxS5B15wOKPLQHVbfVyl1eXd/o1JtuM+mQudj163tOx3vVgTR3oSTxBWYVtwFauKFsJmPhyIYjuBZ8X3Zs8dS0jH3wKNYZzW0qE1NPNbRaEm8RXWZxmNJwU166M=" + ## original key - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" git: # for cloning From de1132d878ee25885a6cae55816914292989e6ac Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 30 Nov 2016 06:01:28 -0500 Subject: [PATCH 110/183] ERR: raise on python in object hashing, only supporting strings, nulls xref #14729 Author: Jeff Reback Closes #14767 from jreback/hashing_object and squashes the following commits: 9a5a5d4 [Jeff Reback] ERR: raise on python in object hashing, only supporting strings, nulls --- pandas/src/hash.pyx | 14 ++++++++++++-- pandas/tools/tests/test_hashing.py | 27 ++++++++++++++++----------- 2 files changed, 28 
insertions(+), 13 deletions(-) diff --git a/pandas/src/hash.pyx b/pandas/src/hash.pyx index b8c309f1f7a13..6c0c7804edd05 100644 --- a/pandas/src/hash.pyx +++ b/pandas/src/hash.pyx @@ -7,6 +7,7 @@ cimport numpy as cnp import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t +from util cimport _checknull from cpython cimport (PyString_Check, PyBytes_Check, PyUnicode_Check) @@ -29,6 +30,11 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): ------- 1-d uint64 ndarray of hashes + Notes + ----- + allowed values must be strings, or nulls + mixed array types will raise TypeError + """ cdef: Py_ssize_t i, l, n @@ -60,10 +66,14 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): data = val elif PyUnicode_Check(val): data = val.encode(encoding) - else: - # non-strings + elif _checknull(val): + # null, stringify and encode data = str(val).encode(encoding) + else: + raise TypeError("{} of type {} is not a valid type for " + "hashing, must be string or null".format(val, type(val))) + l = len(data) lens[i] = l cdata = data diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 3e4c77244d2f7..4e05ae7007c80 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -63,6 +63,7 @@ def test_hash_pandas_object(self): Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(['a', 'b', 'c']), Series(['a', np.nan, 'c']), + Series(['a', None, 'c']), Series([True, False, True]), Index([1, 2, 3]), Index([True, False, True]), @@ -71,9 +72,7 @@ def test_hash_pandas_object(self): tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - Series([1, 2, 3], index=pd.MultiIndex.from_tuples( - [('a', 1), ('a', 2), ('b', 1)]))]: + tm.makeTimedeltaIndex()]: self.check_equal(obj) self.check_not_equal_with_index(obj) @@ -115,16 +114,22 @@ def f(): hash_pandas_object(Series(list('abc')), hash_key='foo') self.assertRaises(ValueError, f) - def test_mixed(self): - # mixed objects + def test_unsupported_objects(self): + + # mixed objects are not supported obj = Series(['1', 2, 3]) - self.check_equal(obj) - self.check_not_equal_with_index(obj) - # mixed are actually equal when stringified - a = hash_pandas_object(obj) - b = hash_pandas_object(Series(list('123'))) - self.assert_series_equal(a, b) + def f(): + hash_pandas_object(obj) + self.assertRaises(TypeError, f) + + # MultiIndex are represented as tuples + obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples( + [('a', 1), ('a', 2), ('b', 1)])) + + def f(): + hash_pandas_object(obj) + self.assertRaises(TypeError, f) def test_alread_encoded(self): # if already encoded then ok From 11ca57ffa164de3498ccad8fb3571e5fd604a3ed Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 30 Nov 2016 06:03:10 -0500 Subject: [PATCH 111/183] DOC: Remove SparseSeries from SparseArray doc (#14769) --- pandas/sparse/array.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index a15def65cad7e..4bb36446c9ff7 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -547,8 +547,8 @@ def astype(self, dtype=None, copy=True): def copy(self, deep=True): """ - Make a copy of the SparseSeries. Only the actual sparse values need to - be copied + Make a copy of the SparseArray. Only the actual sparse values need to + be copied. 
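
[Editorial sketch of the object-hashing restriction introduced in patch 110 above: object columns are hashable only when they contain strings or nulls, otherwise a TypeError is raised.]

    import pandas as pd
    from pandas.tools.hashing import hash_pandas_object

    hash_pandas_object(pd.Series(['a', None, 'c']))   # ok: strings and nulls only

    try:
        hash_pandas_object(pd.Series(['1', 2, 3]))    # mixed str/int object column
    except TypeError as err:
        print(err)   # "... is not a valid type for hashing, must be string or null"
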
""" if deep: values = self.sp_values.copy() @@ -559,9 +559,9 @@ def copy(self, deep=True): def count(self): """ - Compute sum of non-NA/null observations in SparseSeries. If the + Compute sum of non-NA/null observations in SparseArray. If the fill_value is not NaN, the "sparse" locations will be included in the - observation count + observation count. Returns ------- From e299560dff3c3620506d7326aef82b68d6a7e3e6 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 30 Nov 2016 06:44:52 -0500 Subject: [PATCH 112/183] PERF: Improve replace perf When .replace is called with `dict`, replacements are done per value. Current impl try to soft convert the dtype in every replacement, but it is enough to be done in the final replacement. Author: sinhrks Closes #12745 from sinhrks/replace_perf and squashes the following commits: ffc59b0 [sinhrks] PERF: Improve replace perf --- asv_bench/benchmarks/replace.py | 24 ++++++++++++++++++++++++ doc/source/whatsnew/v0.19.2.txt | 1 + pandas/core/generic.py | 23 +++++++++++++++-------- pandas/core/internals.py | 17 ++++++++++++----- 4 files changed, 52 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 869ddd8d6fa49..66b8af53801ac 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -32,6 +32,30 @@ def time_replace_large_dict(self): self.s.replace(self.to_rep, inplace=True) +class replace_convert(object): + goal_time = 0.5 + + def setup(self): + self.n = (10 ** 3) + self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n))) + self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n))) + self.s = Series(np.random.randint(self.n, size=(10 ** 3))) + self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)), + 'B': np.random.randint(self.n, size=(10 ** 3))}) + + def time_replace_series_timestamp(self): + self.s.replace(self.to_ts) + + def time_replace_series_timedelta(self): + self.s.replace(self.to_td) + + def time_replace_frame_timestamp(self): + self.df.replace(self.to_ts) + + def time_replace_frame_timedelta(self): + self.df.replace(self.to_td) + + class replace_replacena(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 6ee6271929008..cafbdb731f494 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -21,6 +21,7 @@ Highlights include: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of ``.replace()`` (:issue:`12745`) .. 
_whatsnew_0192.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fbc6333dd6fdd..27ca817c19a63 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3477,20 +3477,27 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, res = self if inplace else self.copy() for c, src in compat.iteritems(to_replace): if c in value and c in self: + # object conversion is handled in + # series.replace which is called recursivelly res[c] = res[c].replace(to_replace=src, value=value[c], - inplace=False, regex=regex) + inplace=False, + regex=regex) return None if inplace else res # {'A': NA} -> 0 elif not is_list_like(value): - for k, src in compat.iteritems(to_replace): - if k in self: - new_data = new_data.replace(to_replace=src, - value=value, - filter=[k], - inplace=inplace, - regex=regex) + keys = [(k, src) for k, src in compat.iteritems(to_replace) + if k in self] + keys_len = len(keys) - 1 + for i, (k, src) in enumerate(keys): + convert = i == keys_len + new_data = new_data.replace(to_replace=src, + value=value, + filter=[k], + inplace=inplace, + regex=regex, + convert=convert) else: raise TypeError('value argument must be scalar, dict, or ' 'Series') diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 43beefffd448e..120a9cbcd1a75 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -622,7 +622,6 @@ def replace(self, to_replace, value, inplace=False, filter=None, original_to_replace = to_replace mask = isnull(self.values) - # try to replace, if we raise an error, convert to ObjectBlock and # retry try: @@ -1795,13 +1794,14 @@ def should_store(self, value): return issubclass(value.dtype.type, np.bool_) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, mgr=None): + regex=False, convert=True, mgr=None): to_replace_values = np.atleast_1d(to_replace) if not np.can_cast(to_replace_values, bool): return self return super(BoolBlock, self).replace(to_replace, value, inplace=inplace, filter=filter, - regex=regex, mgr=mgr) + regex=regex, convert=convert, + mgr=mgr) class ObjectBlock(Block): @@ -3214,6 +3214,7 @@ def comp(s): masks = [comp(s) for i, s in enumerate(src_list)] result_blocks = [] + src_len = len(src_list) - 1 for blk in self.blocks: # its possible to get multiple result blocks here @@ -3223,8 +3224,9 @@ def comp(s): new_rb = [] for b in rb: if b.dtype == np.object_: + convert = i == src_len result = b.replace(s, d, inplace=inplace, regex=regex, - mgr=mgr) + mgr=mgr, convert=convert) new_rb = _extend_blocks(result, new_rb) else: # get our mask for this element, sized to this @@ -4788,7 +4790,12 @@ def _putmask_smart(v, m, n): # change the dtype dtype, _ = _maybe_promote(n.dtype) - nv = v.astype(dtype) + + if is_extension_type(v.dtype) and is_object_dtype(dtype): + nv = v.get_values(dtype) + else: + nv = v.astype(dtype) + try: nv[m] = n[m] except ValueError: From 1efa51c05e3415add28263a944ae69e81c5c547f Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Wed, 30 Nov 2016 09:11:05 -0600 Subject: [PATCH 113/183] DOC/TST: dtype param in read_fwf (#14768) --- doc/source/io.rst | 11 +++++++++++ doc/source/whatsnew/v0.20.0.txt | 9 +++++++++ pandas/io/tests/parser/test_read_fwf.py | 20 ++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index b1c151def26af..f524d37d0de60 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1268,11 +1268,22 @@ is whitespace). df = pd.read_fwf('bar.csv', header=None, index_col=0) df +.. 
versionadded:: 0.20.0 + +``read_fwf`` supports the ``dtype`` parameter for specifying the types of +parsed columns to be different from the inferred type. + +.. ipython:: python + + pd.read_fwf('bar.csv', header=None, index_col=0).dtypes + pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes + .. ipython:: python :suppress: os.remove('bar.csv') + Indexes ''''''' diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ff086380fdb05..6fe0ad8092a03 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -34,6 +34,15 @@ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying t pd.read_csv(StringIO(data), engine='python').dtypes pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes +The ``dtype`` keyword argument is also now supported in the :func:`read_fwf` function for parsing +fixed-width text files. + +.. ipython:: python + + data = "a b\n1 2\n3 4" + pd.read_fwf(StringIO(data)).dtypes + pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes + .. _whatsnew_0200.enhancements.other: Other enhancements diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py index 11b10211650d6..42b1116280a1e 100644 --- a/pandas/io/tests/parser/test_read_fwf.py +++ b/pandas/io/tests/parser/test_read_fwf.py @@ -345,3 +345,23 @@ def test_variable_width_unicode(self): header=None, encoding='utf8') tm.assert_frame_equal(expected, read_fwf( BytesIO(test.encode('utf8')), header=None, encoding='utf8')) + + def test_dtype(self): + data = ''' a b c +1 2 3.2 +3 4 5.2 +''' + colspecs = [(0, 5), (5, 10), (10, None)] + result = pd.read_fwf(StringIO(data), colspecs=colspecs) + expected = pd.DataFrame({ + 'a': [1, 3], + 'b': [2, 4], + 'c': [3.2, 5.2]}, columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + expected['a'] = expected['a'].astype('float64') + expected['b'] = expected['b'].astype(str) + expected['c'] = expected['c'].astype('int32') + result = pd.read_fwf(StringIO(data), colspecs=colspecs, + dtype={'a': 'float64', 'b': str, 'c': 'int32'}) + tm.assert_frame_equal(result, expected) From 87beca3d0d9142c646577010f604cc87fe04a652 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Nov 2016 16:17:51 +0100 Subject: [PATCH 114/183] PEP8: fix line length --- pandas/src/hash.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/src/hash.pyx b/pandas/src/hash.pyx index 6c0c7804edd05..a393e0df96954 100644 --- a/pandas/src/hash.pyx +++ b/pandas/src/hash.pyx @@ -71,8 +71,8 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): data = str(val).encode(encoding) else: - raise TypeError("{} of type {} is not a valid type for " - "hashing, must be string or null".format(val, type(val))) + raise TypeError("{} of type {} is not a valid type for hashing, " + "must be string or null".format(val, type(val))) l = len(data) lens[i] = l From 2bd9c95ffe6e026c035327246a63c8a89e858ddb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Nov 2016 17:01:27 +0100 Subject: [PATCH 115/183] DOC: specify link to frequencies (#14760) --- pandas/core/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index afddb86988970..ea26f5c0d29b8 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -175,8 +175,8 @@ class Grouper(object): freq : string / frequency object, defaults to None This 
will groupby the specified frequency if the target selection (via key or level) is a datetime-like object. For full specification - of available frequencies, please see - `here `_. + of available frequencies, please see `here + `_. axis : number/name of the axis, defaults to 0 sort : boolean, default to False whether to sort the resulting labels From 1b0333b70363bf1d0c4cd17f57b29aa2a192acc9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 30 Nov 2016 18:59:51 -0500 Subject: [PATCH 116/183] BLD: update pandas-docs github token again --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8a8e954429eb0..5b5cfc1ac1628 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,8 @@ env: global: # pandas-docs-travis GH - - secure: "B3m5+dbk7uPz3lpx8BWDbyEL27M5NcmKzAmCwDRuFRsUI0PaAxS5B15wOKPLQHVbfVyl1eXd/o1JtuM+mQudj163tOx3vVgTR3oSTxBWYVtwFauKFsJmPhyIYjuBZ8X3Zs8dS0jH3wKNYZzW0qE1NPNbRaEm8RXWZxmNJwU166M=" + - secure: "U4GkUaX0K5FqHsHlxXiTr53t0zg8l9p3x7Xze3T0l4mEfhJdqVjayizRE0w0Uo3D54YY7X4lCRtI+bzFz20RxAEoEUyIoWtlUP7eNY3XuhViipY7gtYJpS+68VN5MnChzzz73cNj89fLBvCFyYhMTXHSrbm+yHSg6eRlqzzhHWc=" + ## original key - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" git: From 725453deb2efae1b84d87131e785a42a22753659 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 1 Dec 2016 15:40:37 -0500 Subject: [PATCH 117/183] BLD: restore original pandas-docs key --- .travis.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5b5cfc1ac1628..4a0c6d77fcf45 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,9 +16,7 @@ env: global: # pandas-docs-travis GH - - secure: "U4GkUaX0K5FqHsHlxXiTr53t0zg8l9p3x7Xze3T0l4mEfhJdqVjayizRE0w0Uo3D54YY7X4lCRtI+bzFz20RxAEoEUyIoWtlUP7eNY3XuhViipY7gtYJpS+68VN5MnChzzz73cNj89fLBvCFyYhMTXHSrbm+yHSg6eRlqzzhHWc=" - - ## original key - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" + - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" git: # for cloning From b3083278b676337a5ceafec0303adbad108036e6 Mon Sep 17 00:00:00 2001 From: Jun Kim Date: Fri, 2 Dec 2016 21:51:50 +0900 Subject: [PATCH 118/183] Fix typo at pandas/core/generic.py (#14787) indentifier -> identifier --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 27ca817c19a63..7868969f477b0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1095,7 +1095,7 @@ def to_hdf(self, path_or_buf, key, **kwargs): ---------- path_or_buf : the path (string) or HDFStore object key : string - indentifier for the group in the store + identifier for the group in the store mode : optional, {'a', 'w', 'r+'}, default 'a' ``'w'`` From b787468ca2af344baac83758fea3026ebd8a419c Mon Sep 17 00:00:00 2001 From: Dody Suria Wijaya Date: Fri, 2 Dec 2016 06:52:54 -0600 Subject: [PATCH 119/183] Fix a simple typo (#14785) --- pandas/core/ops.py | 2 +- pandas/io/tests/test_pytables.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 7cff1104c50be..96b447cda4bc4 100644 --- 
a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -421,7 +421,7 @@ def _validate(self, lvalues, rvalues, name): # if tz's must be equal (same or None) if getattr(lvalues, 'tz', None) != getattr(rvalues, 'tz', None): - raise ValueError("Incompatbile tz's on datetime subtraction " + raise ValueError("Incompatible tz's on datetime subtraction " "ops") elif ((self.is_timedelta_lhs or self.is_offset_lhs) and diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 72973105ff3bd..aa59a74606674 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1947,7 +1947,7 @@ def test_append_raise(self): self.assertRaises(TypeError, store.append, 'df', Series(np.arange(10))) - # appending an incompatbile table + # appending an incompatible table df = tm.makeDataFrame() store.append('df', df) From 3d73d0501bacaeb2c3c1686c0e936cf3707887c2 Mon Sep 17 00:00:00 2001 From: Thrasibule Date: Fri, 2 Dec 2016 17:27:27 -0500 Subject: [PATCH 120/183] Small typos (#14789) --- pandas/tseries/offsets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index efcde100d1ce7..abb7018b2e25b 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -155,14 +155,14 @@ class DateOffset(object): DateOffsets can be created to move dates forward a given number of valid dates. For example, Bday(2) can be added to a date to move it two business days forward. If the date does not start on a - valid date, first it is moved to a valid date. Thus psedo code + valid date, first it is moved to a valid date. Thus pseudo code is: def __add__(date): date = rollback(date) # does nothing if date is valid return date + - When a date offset is created for a negitive number of periods, + When a date offset is created for a negative number of periods, the date is first rolled forward. The pseudo code is: def __add__(date): From 56c3aaeea3c55f098b803caf6318917d79ab2790 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Sat, 3 Dec 2016 05:11:59 -0500 Subject: [PATCH 121/183] ENH: support cut/qcut for datetime/timedelta (GH14714) (#14737) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/tests/test_tile.py | 30 ++++++++ pandas/tools/tile.py | 121 ++++++++++++++++++++++++-------- 3 files changed, 121 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6fe0ad8092a03..5e94a95e38cbb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -50,6 +50,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) +- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`) .. 
_whatsnew_0200.api_breaking: diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index e5b9c65b515d6..33d2a01b1256e 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -12,6 +12,7 @@ from pandas.core.algorithms import quantile from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod +from pandas import to_datetime, DatetimeIndex class TestCut(tm.TestCase): @@ -283,6 +284,35 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + def test_datetime_cut(self): + # GH 14714 + # testing for time data to be present as series + data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) + result, bins = cut(data, 3, retbins=True) + expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', + '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], + ).astype("category", ordered=True) + tm.assert_series_equal(result, expected) + + # testing for time data to be present as list + data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), + np.datetime64('2013-01-03')] + result, bins = cut(data, 3, retbins=True) + tm.assert_series_equal(Series(result), expected) + + # testing for time data to be present as ndarray + data = np.array([np.datetime64('2013-01-01'), + np.datetime64('2013-01-02'), + np.datetime64('2013-01-03')]) + result, bins = cut(data, 3, retbins=True) + tm.assert_series_equal(Series(result), expected) + + # testing for time data to be present as datetime index + data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']) + result, bins = cut(data, 3, retbins=True) + tm.assert_series_equal(Series(result), expected) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index ef75f2f84779b..f62bac9e951a7 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -11,6 +11,8 @@ import pandas.core.algorithms as algos import pandas.core.nanops as nanops from pandas.compat import zip +from pandas import to_timedelta, to_datetime +from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype import numpy as np @@ -81,14 +83,17 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 + + # for handling the cut for datetime and timedelta objects + x_is_series, series_index, name, x = _preprocess_for_cut(x) + x, dtype = _coerce_to_type(x) + if not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - try: # for array-like - sz = x.size - except AttributeError: - x = np.asarray(x) - sz = x.size + + sz = x.size + if sz == 0: raise ValueError('Cannot cut empty array') # handle empty arrays. Can't determine range, so use 0-1. 
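
[Editorial sketch alongside the datetime test above: the whatsnew entry also claims timedelta64 support, which goes through the same coercion path (labels are rendered via to_timedelta/to_datetime in the code below). Values are arbitrary.]

    import pandas as pd

    td = pd.Series(pd.timedelta_range('1 day', periods=3))
    pd.cut(td, 2)     # bin labels expressed as timedeltas
    pd.qcut(td, 2)    # quantile-based binning uses the same coercion
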
@@ -114,9 +119,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') - return _bins_to_cuts(x, bins, right=right, labels=labels, - retbins=retbins, precision=precision, - include_lowest=include_lowest) + fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, + precision=precision, + include_lowest=include_lowest, dtype=dtype) + + return _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name) def qcut(x, q, labels=None, retbins=False, precision=3): @@ -166,26 +174,26 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ + x_is_series, series_index, name, x = _preprocess_for_cut(x) + + x, dtype = _coerce_to_type(x) + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q bins = algos.quantile(x, quantiles) - return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, - precision=precision, include_lowest=True) + fac, bins = _bins_to_cuts(x, bins, labels=labels, + precision=precision, include_lowest=True, + dtype=dtype) + return _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name) -def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, - precision=3, name=None, include_lowest=False): - x_is_series = isinstance(x, Series) - series_index = None - - if x_is_series: - series_index = x.index - if name is None: - name = x.name - x = np.asarray(x) +def _bins_to_cuts(x, bins, right=True, labels=None, + precision=3, include_lowest=False, + dtype=None): side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) @@ -205,7 +213,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, while True: try: levels = _format_levels(bins, precision, right=right, - include_lowest=include_lowest) + include_lowest=include_lowest, + dtype=dtype) except ValueError: increases += 1 precision += 1 @@ -229,18 +238,12 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, fac = fac.astype(np.float64) np.putmask(fac, na_mask, np.nan) - if x_is_series: - fac = Series(fac, index=series_index, name=name) - - if not retbins: - return fac - return fac, bins def _format_levels(bins, prec, right=True, - include_lowest=False): - fmt = lambda v: _format_label(v, precision=prec) + include_lowest=False, dtype=None): + fmt = lambda v: _format_label(v, precision=prec, dtype=dtype) if right: levels = [] for a, b in zip(bins, bins[1:]): @@ -258,12 +261,16 @@ def _format_levels(bins, prec, right=True, else: levels = ['[%s, %s)' % (fmt(a), fmt(b)) for a, b in zip(bins, bins[1:])] - return levels -def _format_label(x, precision=3): +def _format_label(x, precision=3, dtype=None): fmt_str = '%%.%dg' % precision + + if is_datetime64_dtype(dtype): + return to_datetime(x, unit='ns') + if is_timedelta64_dtype(dtype): + return to_timedelta(x, unit='ns') if np.isinf(x): return str(x) elif is_float(x): @@ -300,3 +307,55 @@ def _trim_zeros(x): if len(x) > 1 and x[-1] == '.': x = x[:-1] return x + + +def _coerce_to_type(x): + """ + if the passed data is of datetime/timedelta type, + this method converts it to integer so that cut method can + handle it + """ + dtype = None + + if is_timedelta64_dtype(x): + x = to_timedelta(x).view(np.int64) + dtype = np.timedelta64 + elif is_datetime64_dtype(x): + x = to_datetime(x).view(np.int64) + dtype = np.datetime64 + + return x, dtype + + +def _preprocess_for_cut(x): + """ + handles preprocessing 
for cut where we convert passed + input to array, strip the index information and store it + seperately + """ + x_is_series = isinstance(x, Series) + series_index = None + name = None + + if x_is_series: + series_index = x.index + name = x.name + + x = np.asarray(x) + + return x_is_series, series_index, name, x + + +def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): + """ + handles post processing for the cut method where + we combine the index information if the originally passed + datatype was a series + """ + if x_is_series: + fac = Series(fac, index=series_index, name=name) + + if not retbins: + return fac + + return fac, bins From 588e29d23fa289916a8b057d093af62f35c6dcf6 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Sat, 3 Dec 2016 06:37:32 -0600 Subject: [PATCH 122/183] API: add dtype param to read_excel (#14786) * API add dtype param to read_excel * doc fixup --- doc/source/io.rst | 14 ++++++++++++++ doc/source/whatsnew/v0.20.0.txt | 6 +++--- pandas/io/excel.py | 17 +++++++++++++---- pandas/io/parsers.py | 4 ++-- pandas/io/tests/data/testdtype.xls | Bin 0 -> 22528 bytes pandas/io/tests/data/testdtype.xlsm | Bin 0 -> 8517 bytes pandas/io/tests/data/testdtype.xlsx | Bin 0 -> 8501 bytes pandas/io/tests/test_excel.py | 27 +++++++++++++++++++++++++++ 8 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 pandas/io/tests/data/testdtype.xls create mode 100644 pandas/io/tests/data/testdtype.xlsm create mode 100644 pandas/io/tests/data/testdtype.xlsx diff --git a/doc/source/io.rst b/doc/source/io.rst index f524d37d0de60..f22374553e9c3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2538,6 +2538,20 @@ missing data to recover integer dtype: cfun = lambda x: int(x) if x else -1 read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) +dtype Specifications +++++++++++++++++++++ + +.. versionadded:: 0.20 + +As an alternative to converters, the type for an entire column can +be specified using the `dtype` keyword, which takes a dictionary +mapping column names to types. To interpret data with +no type inference, use the type ``str`` or ``object``. + +.. code-block:: python + + read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) + .. _io.excel_writer: Writing Excel Files diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 5e94a95e38cbb..f172f70932d60 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -22,8 +22,8 @@ New features ~~~~~~~~~~~~ -``read_csv`` supports ``dtype`` keyword for python engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``dtype`` keyword for data io +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs ` for more information. @@ -35,7 +35,7 @@ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying t pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes The ``dtype`` keyword argument is also now supported in the :func:`read_fwf` function for parsing -fixed-width text files. +fixed-width text files, and :func:`read_excel` for parsing Excel files. .. 
ipython:: python diff --git a/pandas/io/excel.py b/pandas/io/excel.py index d3171ceedfc03..6b7c597ecfcdc 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -87,6 +87,14 @@ either be integers or column labels, values are functions that take one input argument, the Excel cell content, and return the transformed content. +dtype : Type name or dict of column -> type, default None + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} + Use `str` or `object` to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + + .. versionadded:: 0.20.0 + true_values : list, default None Values to consider as True @@ -184,8 +192,8 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, has_index_names=None, converters=None, - true_values=None, false_values=None, engine=None, squeeze=False, - **kwds): + dtype=None, true_values=None, false_values=None, engine=None, + squeeze=False, **kwds): if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -195,7 +203,7 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, thousands=thousands, convert_float=convert_float, has_index_names=has_index_names, - skip_footer=skip_footer, converters=converters, + skip_footer=skip_footer, converters=converters, dtype=dtype, true_values=true_values, false_values=false_values, squeeze=squeeze, **kwds) @@ -318,7 +326,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, true_values=None, false_values=None, verbose=False, - squeeze=False, **kwds): + dtype=None, squeeze=False, **kwds): skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: @@ -501,6 +509,7 @@ def _parse_cell(cell_contents, cell_typ): skiprows=skiprows, skipfooter=skip_footer, squeeze=squeeze, + dtype=dtype, **kwds) output[asheetname] = parser.read() diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 580a3398bb66a..ef839297c80d3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -18,7 +18,7 @@ from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, is_float, is_dtype_equal, - is_object_dtype, + is_object_dtype, is_string_dtype, is_scalar, is_categorical_dtype) from pandas.types.missing import isnull from pandas.types.cast import _astype_nansafe @@ -1329,7 +1329,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try_num_bool=False) else: # skip inference if specified dtype is object - try_num_bool = not (cast_type and is_object_dtype(cast_type)) + try_num_bool = not (cast_type and is_string_dtype(cast_type)) # general type inference and conversion cvals, na_count = self._infer_types( diff --git a/pandas/io/tests/data/testdtype.xls b/pandas/io/tests/data/testdtype.xls new file mode 100644 index 0000000000000000000000000000000000000000..f63357524324ff6ff97f4b1a15b6a371957ec6e1 GIT binary patch literal 22528 zcmeHP32YqY6@If`+v_8V?Ig}gvhk7FjvXA^3FPuRF;|3h2tq@JX>jayY{ju7$A&~A zjcGxuN=m_@K_za12Gj!z4M#)O&;$vADw!5hC?$GdnZh zZE4l2Gpm1g=FR`U_vXF#zyH|Ze(C4MZ#{5q_S<4Rnk7p(R|dVeRkdt`lqFJ)4_M87HDNPTP1wQ_ky=@46{f#|Ri0AKBr+073SBOX(UOuv 
  [base85-encoded git binary patch data for the new test fixtures
   pandas/io/tests/data/testdtype.xls and testdtype.xlsm elided -- not human readable]

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/data/testdtype.xlsx b/pandas/io/tests/data/testdtype.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..7c65263c373a3a43ee6e7adde9ec0a3abba219a9
GIT binary patch
literal 8501

  [base85-encoded binary data for testdtype.xlsx elided]
Q02lT1M;+X6>3-b(AMX;Yj{pDw literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 49a508dd22023..9c909398d2d88 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -373,6 +373,33 @@ def test_reader_converters(self): actual = self.get_exceldf(basename, 'Sheet1', converters=converters) tm.assert_frame_equal(actual, expected) + def test_reader_dtype(self): + # GH 8212 + basename = 'testdtype' + actual = self.get_exceldf(basename) + + expected = DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [2.5, 3.5, 4.5, 5.5], + 'c': [1, 2, 3, 4], + 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( + columns=['a', 'b', 'c', 'd']) + + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf(basename, + dtype={'a': 'float64', + 'b': 'float32', + 'c': str}) + + expected['a'] = expected['a'].astype('float64') + expected['b'] = expected['b'].astype('float32') + expected['c'] = ['001', '002', '003', '004'] + tm.assert_frame_equal(actual, expected) + + with tm.assertRaises(ValueError): + actual = self.get_exceldf(basename, dtype={'d': 'int64'}) + def test_reading_all_sheets(self): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. From 27fcd811f5b5df89eeede049cd048d94a65e7ff4 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 4 Dec 2016 12:21:44 -0500 Subject: [PATCH 123/183] BUG: multi-index HDFStore data_columns=True closes #14435 Author: Chris Closes #14791 from chris-b1/hdf-mi-datacolumns and squashes the following commits: 5d32610 [Chris] BUG: multi-index HDFStore data_columns=True --- doc/source/whatsnew/v0.19.2.txt | 2 +- pandas/io/pytables.py | 2 +- pandas/io/tests/test_pytables.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index cafbdb731f494..130653441fc0d 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -58,7 +58,7 @@ Bug Fixes - +- Bug ``HDFStore`` writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`) - Bug in clipboard functions on linux with python2 with unicode and separators (:issue:`13747`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b8c2b146b6259..a5ef4e0688ea6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4254,7 +4254,7 @@ def write(self, obj, data_columns=None, **kwargs): if data_columns is None: data_columns = [] elif data_columns is True: - data_columns = obj.columns[:] + data_columns = obj.columns.tolist() obj, self.levels = self.validate_multiindex(obj) for n in self.levels: if n not in data_columns: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index aa59a74606674..69a935e07bbfc 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1818,6 +1818,19 @@ def test_select_columns_in_where(self): store.put('s', s, format='table') tm.assert_series_equal(store.select('s', where="columns=['A']"), s) + def test_mi_data_columns(self): + # GH 14435 + idx = pd.MultiIndex.from_arrays([date_range('2000-01-01', periods=5), + range(5)], names=['date', 'id']) + df = pd.DataFrame({'a': [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + + with ensure_clean_store(self.path) as store: + store.append('df', df, data_columns=True) + + actual = store.select('df', where='id == 1') + expected = df.iloc[[1], :] + tm.assert_frame_equal(actual, expected) + def test_pass_spec_to_storer(self): df = tm.makeDataFrame() From 
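
[Editorial sketch of the HDFStore case patch 123 above fixes. The 'demo.h5' path is arbitrary and PyTables must be installed.]

    import pandas as pd

    idx = pd.MultiIndex.from_arrays([pd.date_range('2000-01-01', periods=5),
                                     range(5)], names=['date', 'id'])
    df = pd.DataFrame({'a': [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx)

    with pd.HDFStore('demo.h5') as store:
        store.append('df', df, data_columns=True)     # writing a MultiIndex frame with data_columns=True is the fixed case
        subset = store.select('df', where='id == 1')  # query on the named index level
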
f23010aa930e4301a6e70efce92ed1afc50dfaaa Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 4 Dec 2016 12:34:14 -0500 Subject: [PATCH 124/183] BUG: Bug in a groupby of a non-lexsorted MultiIndex closes #14776 Author: Jeff Reback Closes #14777 from jreback/mi_sort and squashes the following commits: cf31905 [Jeff Reback] BUG: Bug in a groupby of a non-lexsorted MultiIndex and multiple grouping levels --- doc/source/whatsnew/v0.19.2.txt | 2 +- pandas/core/groupby.py | 12 +++++++++++- pandas/tests/test_groupby.py | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 130653441fc0d..d365a3f30db25 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -36,7 +36,7 @@ Bug Fixes - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when ``skipfooter`` was not being respected by Python's CSV library (:issue:`13879`) - +- Bug in ``.groupby(..., sort=True)`` of a non-lexsorted MultiIndex when grouping with multiple levels (:issue:`14776`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ea26f5c0d29b8..f449e16686190 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -861,7 +861,17 @@ def reset_identity(values): if isinstance(result, Series): result = result.reindex(ax) else: - result = result.reindex_axis(ax, axis=self.axis) + + # this is a very unfortunate situation + # we have a multi-index that is NOT lexsorted + # and we have a result which is duplicated + # we can't reindex, so we resort to this + # GH 14776 + if isinstance(ax, MultiIndex) and not ax.is_unique: + result = result.take(result.index.get_indexer_for( + ax.values).unique(), axis=self.axis) + else: + result = result.reindex_axis(ax, axis=self.axis) elif self.group_keys: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 52d1c5c3681e0..37499e09d6dc6 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4736,6 +4736,25 @@ def test_groupby_multiindex_not_lexsorted(self): result = not_lexsorted_df.groupby('a').mean() tm.assert_frame_equal(expected, result) + # a transforming function should work regardless of sort + # GH 14776 + df = DataFrame({'x': ['a', 'a', 'b', 'a'], + 'y': [1, 1, 2, 2], + 'z': [1, 2, 3, 4]}).set_index(['x', 'y']) + self.assertFalse(df.index.is_lexsorted()) + + for level in [0, 1, [0, 1]]: + for sort in [False, True]: + result = df.groupby(level=level, sort=sort).apply( + DataFrame.drop_duplicates) + expected = df + tm.assert_frame_equal(expected, result) + + result = df.sort_index().groupby(level=level, sort=sort).apply( + DataFrame.drop_duplicates) + expected = df.sort_index() + tm.assert_frame_equal(expected, result) + def test_groupby_levels_and_columns(self): # GH9344, GH9049 idx_names = ['x', 'y'] From c0e13d1bccd4a783486eba8cc769db48a7875de8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 16 Nov 2016 22:44:11 -0800 Subject: [PATCH 125/183] BUG: Bug upon Series.Groupby.nunique with empty Series closes #12553 closes #14770 --- doc/source/whatsnew/v0.19.2.txt | 2 ++ pandas/core/groupby.py | 6 +++++- pandas/tests/test_groupby.py | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index d365a3f30db25..fe900d0480d01 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -59,6 +59,8 @@ Bug Fixes - Bug ``HDFStore`` writing a 
``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`) +- Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`) + - Bug in clipboard functions on linux with python2 with unicode and separators (:issue:`13747`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f449e16686190..66c9e38766989 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2908,6 +2908,7 @@ def true_and_notnull(x, *args, **kwargs): def nunique(self, dropna=True): """ Returns number of unique elements in the group """ ids, _, _ = self.grouper.group_info + val = self.obj.get_values() try: @@ -2938,7 +2939,10 @@ def nunique(self, dropna=True): inc[idx] = 1 out = np.add.reduceat(inc, idx).astype('int64', copy=False) - res = out if ids[0] != -1 else out[1:] + if len(ids): + res = out if ids[0] != -1 else out[1:] + else: + res = out[1:] ri = self.grouper.result_index # we might have duplications among the bins diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 37499e09d6dc6..a2e1c5e9ff2e8 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6773,6 +6773,13 @@ def test_nunique_with_object(self): expected = pd.Series([1] * 5, name='name', index=index) tm.assert_series_equal(result, expected) + def test_nunique_with_empty_series(self): + # GH 12553 + data = pd.Series(name='name') + result = data.groupby(level=0).nunique() + expected = pd.Series(name='name', dtype='int64') + tm.assert_series_equal(result, expected) + def test_transform_with_non_scalar_group(self): # GH 10165 cols = pd.MultiIndex.from_tuples([ From 5d0e15716c5466e3a12691fb6c257734396ca451 Mon Sep 17 00:00:00 2001 From: "Brandon M. Burroughs" Date: Sun, 4 Dec 2016 15:32:42 -0500 Subject: [PATCH 126/183] ENH: support kind and na_position kwargs in Series.sort_index (#14445) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/generic.py | 3 ++- pandas/core/series.py | 12 +++++++----- pandas/tests/series/test_sorting.py | 25 +++++++++++++++++++++++++ 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f172f70932d60..cfb9d8591b724 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -47,6 +47,7 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files. Other enhancements ^^^^^^^^^^^^^^^^^^ +- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7868969f477b0..77f6ea063da5c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2029,7 +2029,8 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, DataFrames, this option is only applied when sorting on a single column or label. na_position : {'first', 'last'}, default 'last' - `first` puts NaNs at the beginning, `last` puts NaNs at the end + `first` puts NaNs at the beginning, `last` puts NaNs at the end. + Not implemented for MultiIndex. 
sort_remaining : bool, default True if true and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level diff --git a/pandas/core/series.py b/pandas/core/series.py index 56a3933bded3b..d381319947fbc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1770,7 +1770,7 @@ def _try_kind_sort(arr): @Appender(generic._shared_docs['sort_index'] % _shared_doc_kwargs) def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - sort_remaining=True): + kind='quicksort', na_position='last', sort_remaining=True): axis = self._get_axis_number(axis) index = self.index @@ -1780,11 +1780,13 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(index, MultiIndex): from pandas.core.groupby import _lexsort_indexer indexer = _lexsort_indexer(index.labels, orders=ascending) - indexer = _ensure_platform_int(indexer) - new_index = index.take(indexer) else: - new_index, indexer = index.sort_values(return_indexer=True, - ascending=ascending) + from pandas.core.groupby import _nargsort + indexer = _nargsort(index, kind=kind, ascending=ascending, + na_position=na_position) + + indexer = _ensure_platform_int(indexer) + new_index = index.take(indexer) new_values = self._values.take(indexer) result = self._constructor(new_values, index=new_index) diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 826201adbdb50..69e70c15cae50 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -144,3 +144,28 @@ def test_sort_index_multiindex(self): # rows share same level='A': sort has no effect without remaining lvls res = s.sort_index(level='A', sort_remaining=False) assert_series_equal(s, res) + + def test_sort_index_kind(self): + # GH #14444 & #13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3]) + expected_series = Series(index=[1, 2, 3, 3, 4]) + + index_sorted_series = series.sort_index(kind='mergesort') + assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind='quicksort') + assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind='heapsort') + assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_na_position(self): + series = Series(index=[3, 2, 1, 4, 3, np.nan]) + + expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4]) + index_sorted_series = series.sort_index(na_position='first') + assert_series_equal(expected_series_first, index_sorted_series) + + expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan]) + index_sorted_series = series.sort_index(na_position='last') + assert_series_equal(expected_series_last, index_sorted_series) From b97e007035e7bedd26a4057674e4e9591f04ae44 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Mon, 5 Dec 2016 00:57:01 -0800 Subject: [PATCH 127/183] DOC: minor format fix (#14802) --- doc/source/cookbook.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 3e84d15caf50b..a9d0ab5476b66 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -107,10 +107,8 @@ Splitting df = pd.DataFrame( {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df - dflow = df[df.AAA <= 5] - dfhigh = df[df.AAA > 5] - - dflow; dfhigh + dflow = df[df.AAA <= 5]; dflow + dfhigh = df[df.AAA > 5]; dfhigh Building Criteria ***************** From 
b3dd9bafbfcbac1a2b6ab756e588d6e167f90439 Mon Sep 17 00:00:00 2001 From: scls19fr Date: Mon, 5 Dec 2016 15:39:14 +0100 Subject: [PATCH 128/183] ENH: add Series to_excel method (#14780) Closes #8825 --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 4 +-- pandas/core/frame.py | 59 ++------------------------------- pandas/core/generic.py | 56 +++++++++++++++++++++++++++++++ pandas/core/series.py | 16 ++++++++- pandas/io/tests/test_excel.py | 6 ++++ 6 files changed, 83 insertions(+), 59 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 638abd5421862..929664840f583 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -692,6 +692,7 @@ Serialization / IO / Conversion Series.to_pickle Series.to_csv Series.to_dict + Series.to_excel Series.to_frame Series.to_xarray Series.to_hdf diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cfb9d8591b724..9774c3ec9cc7f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -50,8 +50,8 @@ Other enhancements - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`) +- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) .. _whatsnew_0200.api_breaking: @@ -106,4 +106,4 @@ Performance Improvements .. _whatsnew_0200.bug_fixes: Bug Fixes -~~~~~~~~~ +~~~~~~~~~ \ No newline at end of file diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bf1ff28cd63b1..4cb26ab2f5886 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -105,7 +105,8 @@ axes_single_arg="{0 or 'index', 1 or 'columns'}", optional_by=""" by : str or list of str - Name or list of names which refer to the axis items.""") + Name or list of names which refer to the axis items.""", + versionadded_to_excel='') _numeric_only_doc = """numeric_only : boolean, default None Include only float, int, boolean data. If None, will attempt to use @@ -1385,65 +1386,11 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, if path_or_buf is None: return formatter.path_or_buf.getvalue() + @Appender(_shared_docs['to_excel'] % _shared_doc_kwargs) def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep='inf', verbose=True): - """ - Write DataFrame to a excel sheet - - Parameters - ---------- - excel_writer : string or ExcelWriter object - File path or existing ExcelWriter - sheet_name : string, default 'Sheet1' - Name of sheet which will contain DataFrame - na_rep : string, default '' - Missing data representation - float_format : string, default None - Format string for floating point numbers - columns : sequence, optional - Columns to write - header : boolean or list of string, default True - Write out column names. If a list of string is given it is - assumed to be aliases for the column names - index : boolean, default True - Write row names (index) - index_label : string or sequence, default None - Column label for index column(s) if desired. If None is given, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the DataFrame uses MultiIndex. 
- startrow : - upper left cell row to dump data frame - startcol : - upper left cell column to dump data frame - engine : string, default None - write engine to use - you can also set this via the options - ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and - ``io.excel.xlsm.writer``. - merge_cells : boolean, default True - Write MultiIndex and Hierarchical Rows as merged cells. - encoding: string, default None - encoding of the resulting excel file. Only necessary for xlwt, - other writers support unicode natively. - inf_rep : string, default 'inf' - Representation for infinity (there is no native representation for - infinity in Excel) - - Notes - ----- - If passing an existing ExcelWriter object, then the sheet will be added - to the existing workbook. This can be used to save different - DataFrames to one workbook: - - >>> writer = ExcelWriter('output.xlsx') - >>> df1.to_excel(writer,'Sheet1') - >>> df2.to_excel(writer,'Sheet2') - >>> writer.save() - - For compatibility with to_csv, to_excel serializes lists and dicts to - strings before writing. - """ from pandas.io.excel import ExcelWriter need_save = False if encoding is None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 77f6ea063da5c..b7e43d6fe01e8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1016,6 +1016,62 @@ def __setstate__(self, state): # ---------------------------------------------------------------------- # I/O Methods + _shared_docs['to_excel'] = """ + Write %(klass)s to a excel sheet + %(versionadded_to_excel)s + Parameters + ---------- + excel_writer : string or ExcelWriter object + File path or existing ExcelWriter + sheet_name : string, default 'Sheet1' + Name of sheet which will contain DataFrame + na_rep : string, default '' + Missing data representation + float_format : string, default None + Format string for floating point numbers + columns : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : boolean, default True + Write row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + startrow : + upper left cell row to dump data frame + startcol : + upper left cell column to dump data frame + engine : string, default None + write engine to use - you can also set this via the options + ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and + ``io.excel.xlsm.writer``. + merge_cells : boolean, default True + Write MultiIndex and Hierarchical Rows as merged cells. + encoding: string, default None + encoding of the resulting excel file. Only necessary for xlwt, + other writers support unicode natively. + inf_rep : string, default 'inf' + Representation for infinity (there is no native representation for + infinity in Excel) + + Notes + ----- + If passing an existing ExcelWriter object, then the sheet will be added + to the existing workbook. This can be used to save different + DataFrames to one workbook: + + >>> writer = ExcelWriter('output.xlsx') + >>> df1.to_excel(writer,'Sheet1') + >>> df2.to_excel(writer,'Sheet2') + >>> writer.save() + + For compatibility with to_csv, to_excel serializes lists and dicts to + strings before writing. 
+ """ + def to_json(self, path_or_buf=None, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False): diff --git a/pandas/core/series.py b/pandas/core/series.py index d381319947fbc..04aa71607d489 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -80,7 +80,8 @@ inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", unique='np.ndarray', duplicated='Series', - optional_by='') + optional_by='', + versionadded_to_excel='\n.. versionadded:: 0.20.0\n') def _coerce_method(converter): @@ -2621,6 +2622,19 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', if path is None: return result + @Appender(generic._shared_docs['to_excel'] % _shared_doc_kwargs) + def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', + float_format=None, columns=None, header=True, index=True, + index_label=None, startrow=0, startcol=0, engine=None, + merge_cells=True, encoding=None, inf_rep='inf', verbose=True): + df = self.to_frame() + df.to_excel(excel_writer=excel_writer, sheet_name=sheet_name, + na_rep=na_rep, float_format=float_format, columns=columns, + header=header, index=index, index_label=index_label, + startrow=startrow, startcol=startcol, engine=engine, + merge_cells=merge_cells, encoding=encoding, + inf_rep=inf_rep, verbose=verbose) + def dropna(self, axis=0, inplace=False, **kwargs): """ Return Series without null values diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 9c909398d2d88..7a1b5655cfbf7 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1078,6 +1078,12 @@ def test_roundtrip(self): recons = read_excel(path, index_col=0) tm.assert_frame_equal(self.frame, recons) + # GH 8825 Pandas Series should provide to_excel method + s = self.frame["A"] + s.to_excel(path) + recons = read_excel(path, index_col=0) + tm.assert_frame_equal(s.to_frame(), recons) + def test_mixed(self): _skip_if_no_xlrd() From 53bf1b27c7dac2a8d72d9bcfe75f514dae8e8c96 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 5 Dec 2016 18:42:02 -0500 Subject: [PATCH 129/183] BUG: Ensure min_itemsize is always a list (#11412) closes #11412 Author: Pietro Battiston Closes #14728 from toobaz/minitemsizefix and squashes the following commits: e25cd1f [Pietro Battiston] Whatsnew b9bb88f [Pietro Battiston] Tests for previous commit 6406ee8 [Pietro Battiston] BUG: Ensure min_itemsize is always a list --- doc/source/whatsnew/v0.19.2.txt | 3 ++- pandas/io/pytables.py | 4 ++-- pandas/io/tests/test_pytables.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index fe900d0480d01..c2970b6c9a3d3 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -58,7 +58,8 @@ Bug Fixes -- Bug ``HDFStore`` writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`) +- Bug in ``HDFStore`` when writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`) +- Bug in ``HDFStore.append()`` when writing a ``Series`` and passing a ``min_itemsize`` argument containing a value for the ``index`` (:issue:`11412`) - Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a5ef4e0688ea6..693606fdd1d32 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3315,7 +3315,7 @@ def 
validate_data_columns(self, data_columns, min_itemsize): # evaluate the passed data_columns, True == use all columns # take only valide axis labels if data_columns is True: - data_columns = axis_labels + data_columns = list(axis_labels) elif data_columns is None: data_columns = [] @@ -4153,7 +4153,7 @@ def write(self, obj, data_columns=None, **kwargs): obj = DataFrame({name: obj}, index=obj.index) obj.columns = [name] return super(AppendableSeriesTable, self).write( - obj=obj, data_columns=obj.columns, **kwargs) + obj=obj, data_columns=obj.columns.tolist(), **kwargs) def read(self, columns=None, **kwargs): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 69a935e07bbfc..d621797558c8f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1362,6 +1362,16 @@ def check_col(key, name, size): [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) self.assertRaises(ValueError, store.append, 'df_new', df_new) + # min_itemsize on Series index (GH 11412) + df = tm.makeMixedDataFrame().set_index('C') + store.append('ss', df['B'], min_itemsize={'index': 4}) + tm.assert_series_equal(store.select('ss'), df['B']) + + # same as above, with data_columns=True + store.append('ss2', df['B'], data_columns=True, + min_itemsize={'index': 4}) + tm.assert_series_equal(store.select('ss2'), df['B']) + # with nans _maybe_remove(store, 'df') df = tm.makeTimeDataFrame() From 4378f82967f59097055eef17ede50aa515525551 Mon Sep 17 00:00:00 2001 From: Jeff Carey Date: Mon, 5 Dec 2016 20:44:16 -0500 Subject: [PATCH 130/183] BUG: Corrects stopping logic when nrows argument is supplied (#7626) closes #7626 Subsets of tabular files with different "shapes" will now load when a valid skiprows/nrows is given as an argument - Conditions for error: 1) There are different "shapes" within a tabular data file, i.e. different numbers of columns. 2) A "narrower" set of columns is followed by a "wider" (more columns) one, and the narrower set is laid out such that the end of a 262144-byte block occurs within it. Issue summary: The C engine for parsing files reads in 262144 bytes at a time. Previously, the "start_lines" variable in tokenizer.c/tokenize_bytes() was set incorrectly to the first line in that chunk, rather than the overall first row requested. This lead to incorrect logic on when to stop reading when nrows is supplied by the user. This always happened but only caused a crash when a wider set of columns followed in the file. In other cases, extra rows were read in but then harmlessly discarded. This pull request always uses the first requested row for comparisons, so only nrows will be parsed when supplied. 
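A minimal, self-contained sketch of the failure mode described above (column counts, row counts and string widths are illustrative and simply mirror the new regression test, not a real dataset). The narrow 10-column block is made large enough to cross the C parser's 262144-byte read chunk before a wider 15-column block begins; with this change only the requested nrows are tokenized, so the wider block is never reached:

    from pandas.compat import StringIO
    import pandas as pd

    # a "narrow" 10-column block big enough to span the 262144-byte chunk,
    # followed by a "wider" 15-column block
    narrow_header = '\t'.join(['COL_' + str(i) for i in range(10)]) + '\n'
    narrow_row = '\t'.join(['x' * 25 for _ in range(10)]) + '\n'
    wide_header = '\t'.join(['COL_' + str(i) for i in range(15)]) + '\n'
    wide_row = '\t'.join(['y' * 25 for _ in range(15)]) + '\n'

    data = narrow_header + narrow_row * 1050 + wide_header + wide_row * 2

    # only the first 1010 narrow rows should be parsed; previously the C
    # engine read past them and failed on the 15-column rows
    df = pd.read_csv(StringIO(data), sep='\t', nrows=1010)
    assert df.shape == (1010, 10)
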
Author: Jeff Carey Closes #14747 from jeffcarey/fix/7626 and squashes the following commits: cac1bac [Jeff Carey] Removed duplicative test 6f1965a [Jeff Carey] BUG: Corrects stopping logic when nrows argument is supplied (Fixes #7626) --- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/io/tests/parser/c_parser_only.py | 17 +++++++++++++++++ pandas/src/parser/tokenizer.c | 8 +++----- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index c2970b6c9a3d3..d8db514a21392 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -70,6 +70,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`) +- Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index c781b0549ee60..c6ef68fcac9a0 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -371,3 +371,20 @@ def test_internal_null_byte(self): result = self.read_csv(StringIO(data), names=names) tm.assert_frame_equal(result, expected) + + def test_read_nrows_large(self): + # gh-7626 - Read only nrows of data in for large inputs (>262144b) + header_narrow = '\t'.join(['COL_HEADER_' + str(i) + for i in range(10)]) + '\n' + data_narrow = '\t'.join(['somedatasomedatasomedata1' + for i in range(10)]) + '\n' + header_wide = '\t'.join(['COL_HEADER_' + str(i) + for i in range(15)]) + '\n' + data_wide = '\t'.join(['somedatasomedatasomedata2' + for i in range(15)]) + '\n' + test_input = (header_narrow + data_narrow * 1050 + + header_wide + data_wide * 2) + + df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010) + + self.assertTrue(df.size == 1010 * 10) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 748edc7fcacc5..450abcf6c325c 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -726,16 +726,14 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit) +int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { - int i, slen, start_lines; + int i, slen; long maxstreamsize; char c; char *stream; char *buf = self->data + self->datapos; - start_lines = self->lines; - if (make_stream_space(self, self->datalen - self->datapos) < 0) { self->error_msg = "out of memory"; return -1; @@ -1384,7 +1382,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", self->datalen - self->datapos, self->datalen, self->datapos)); - status = tokenize_bytes(self, nrows); + status = tokenize_bytes(self, nrows, start_lines); if (status < 0) { // XXX From 6e514dacc131f044deb74f0a562a51ef3b1201eb Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 20 Nov 2016 20:55:40 -0800 Subject: [PATCH 131/183] BUG: _nsorted incorrect with duplicated values in index closes #13412 closes #14707 --- asv_bench/benchmarks/frame_methods.py | 11 ++++++++++ doc/source/whatsnew/v0.19.2.txt | 1 + pandas/core/algorithms.py | 30 +++++++++++++++++++++++++-- pandas/core/frame.py | 13 ++---------- pandas/core/series.py | 4 ++-- pandas/tests/frame/test_analytics.py | 29 ++++++++++++++++++++++++++ pandas/tests/series/test_analytics.py | 9 ++++++++ 7 files changed, 82 
insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index a21dee2e612d2..df73a474b2683 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1012,3 +1012,14 @@ def setup(self): def time_frame_quantile_axis1(self): self.df.quantile([0.1, 0.5], axis=1) + + +class frame_nlargest(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 3), + columns=list('ABC')) + + def time_frame_nlargest(self): + self.df.nlargest(100, 'A') diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index d8db514a21392..0567a3c3fa2bb 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -61,6 +61,7 @@ Bug Fixes - Bug in ``HDFStore`` when writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`) - Bug in ``HDFStore.append()`` when writing a ``Series`` and passing a ``min_itemsize`` argument containing a value for the ``index`` (:issue:`11412`) - Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`) +- Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8644d4568e44d..effca6398419e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -684,11 +684,12 @@ def select_n_slow(dropped, n, keep, method): _select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} -def select_n(series, n, keep, method): - """Implement n largest/smallest. +def select_n_series(series, n, keep, method): + """Implement n largest/smallest for pandas Series Parameters ---------- + series : pandas.Series object n : int keep : {'first', 'last'}, default 'first' method : str, {'nlargest', 'nsmallest'} @@ -717,6 +718,31 @@ def select_n(series, n, keep, method): return dropped.iloc[inds] +def select_n_frame(frame, columns, n, method, keep): + """Implement n largest/smallest for pandas DataFrame + + Parameters + ---------- + frame : pandas.DataFrame object + columns : list or str + n : int + keep : {'first', 'last'}, default 'first' + method : str, {'nlargest', 'nsmallest'} + + Returns + ------- + nordered : DataFrame + """ + from pandas.core.series import Series + if not is_list_like(columns): + columns = [columns] + columns = list(columns) + ser = getattr(frame[columns[0]], method)(n, keep=keep) + if isinstance(ser, Series): + ser = ser.to_frame() + return ser.merge(frame, on=columns[0], left_index=True)[frame.columns] + + def _finalize_nsmallest(arr, kth_val, n, keep, narr): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')][:n] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4cb26ab2f5886..0053135e1fd85 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3337,15 +3337,6 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, return self.sort_index(level=level, axis=axis, ascending=ascending, inplace=inplace, sort_remaining=sort_remaining) - def _nsorted(self, columns, n, method, keep): - if not is_list_like(columns): - columns = [columns] - columns = list(columns) - ser = getattr(self[columns[0]], method)(n, keep=keep) - ascending = dict(nlargest=False, nsmallest=True)[method] - return self.loc[ser.index].sort_values(columns, ascending=ascending, - kind='mergesort') - def nlargest(self, n, columns, keep='first'): """Get the rows of a DataFrame sorted by the 
`n` largest values of `columns`. @@ -3378,7 +3369,7 @@ def nlargest(self, n, columns, keep='first'): 1 10 b 2 2 8 d NaN """ - return self._nsorted(columns, n, 'nlargest', keep) + return algos.select_n_frame(self, columns, n, 'nlargest', keep) def nsmallest(self, n, columns, keep='first'): """Get the rows of a DataFrame sorted by the `n` smallest @@ -3412,7 +3403,7 @@ def nsmallest(self, n, columns, keep='first'): 0 1 a 1 2 8 d NaN """ - return self._nsorted(columns, n, 'nsmallest', keep) + return algos.select_n_frame(self, columns, n, 'nsmallest', keep) def swaplevel(self, i=-2, j=-1, axis=0): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 04aa71607d489..958cf183578dd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1940,7 +1940,7 @@ def nlargest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nlargest(10) # only sorts up to the N requested """ - return algos.select_n(self, n=n, keep=keep, method='nlargest') + return algos.select_n_series(self, n=n, keep=keep, method='nlargest') @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @@ -1978,7 +1978,7 @@ def nsmallest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nsmallest(10) # only sorts up to the N requested """ - return algos.select_n(self, n=n, keep=keep, method='nsmallest') + return algos.select_n_series(self, n=n, keep=keep, method='nsmallest') def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index e73d3c58aea85..f6081e14d4081 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1323,6 +1323,35 @@ def test_nsmallest_multiple_columns(self): expected = df.sort_values(['a', 'c']).head(5) tm.assert_frame_equal(result, expected) + def test_nsmallest_nlargest_duplicate_index(self): + # GH 13412 + df = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [4, 3, 2, 1], + 'c': [0, 1, 2, 3]}, + index=[0, 0, 1, 1]) + result = df.nsmallest(4, 'a') + expected = df.sort_values('a').head(4) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(4, 'a') + expected = df.sort_values('a', ascending=False).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, ['a', 'c']) + expected = df.sort_values(['a', 'c']).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, ['c', 'a']) + expected = df.sort_values(['c', 'a']).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(4, ['a', 'c']) + expected = df.sort_values(['a', 'c'], ascending=False).head(4) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(4, ['c', 'a']) + expected = df.sort_values(['c', 'a'], ascending=False).head(4) + tm.assert_frame_equal(result, expected) # ---------------------------------------------------------------------- # Isin diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index ad74b4a7e5cda..d4c209d4532e4 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1532,6 +1532,15 @@ def test_nsmallest_nlargest(self): with tm.assertRaisesRegexp(ValueError, msg): s.nlargest(keep='invalid') + # GH 13412 + s = Series([1, 4, 3, 2], index=[0, 0, 1, 1]) + result = s.nlargest(3) + expected = s.sort_values(ascending=False).head(3) + assert_series_equal(result, expected) + result = s.nsmallest(3) + expected = s.sort_values().head(3) + assert_series_equal(result, 
expected) + def test_sortlevel(self): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) s = Series([1, 2], mi) From 8cdc09ef197fbb044f2811ae8fb600001d61c979 Mon Sep 17 00:00:00 2001 From: WillAyd Date: Tue, 6 Dec 2016 06:38:06 -0500 Subject: [PATCH 132/183] ENH: Allow usecols to accept callable (GH14154) (#14234) --- doc/source/io.rst | 25 ++++++++++---- doc/source/whatsnew/v0.20.0.txt | 3 +- pandas/io/parsers.py | 57 ++++++++++++++++++++++--------- pandas/io/tests/parser/usecols.py | 37 +++++++++++++++++--- pandas/parser.pyx | 25 ++++++++++---- 5 files changed, 113 insertions(+), 34 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f22374553e9c3..75f36c5274cd2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -126,13 +126,23 @@ index_col : int or sequence or ``False``, default ``None`` MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider ``index_col=False`` to force pandas to *not* use the first column as the index (row names). -usecols : array-like, default ``None`` - Return a subset of the columns. All elements in this array must either +usecols : array-like or callable, default ``None`` + Return a subset of the columns. If array-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid `usecols` - parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter - results in much faster parsing time and lower memory usage. + inferred from the document header row(s). For example, a valid array-like + `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. + + If callable, the callable function will be evaluated against the column names, + returning names where the callable function evaluates to True: + + .. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) + + Using this parameter results in much faster parsing time and lower memory usage. as_recarray : boolean, default ``False`` DEPRECATED: this argument will be removed in a future version. Please call ``pd.read_csv(...).to_records()`` instead. @@ -617,7 +627,9 @@ Filtering columns (``usecols``) +++++++++++++++++++++++++++++++ The ``usecols`` argument allows you to select any subset of the columns in a -file, either using the column names or position numbers: +file, either using the column names, position numbers or a callable: + +.. versionadded:: 0.20.0 support for callable `usecols` arguments .. 
ipython:: python @@ -625,6 +637,7 @@ file, either using the column names or position numbers: pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), usecols=['b', 'd']) pd.read_csv(StringIO(data), usecols=[0, 2, 3]) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C']) Comments and Empty Lines '''''''''''''''''''''''' diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9774c3ec9cc7f..0bfd755aae40c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -52,6 +52,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) +- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) .. _whatsnew_0200.api_breaking: @@ -106,4 +107,4 @@ Performance Improvements .. _whatsnew_0200.bug_fixes: Bug Fixes -~~~~~~~~~ \ No newline at end of file +~~~~~~~~~ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ef839297c80d3..30443f894a64d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -90,13 +90,18 @@ MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names) -usecols : array-like, default None - Return a subset of the columns. All elements in this array must either +usecols : array-like or callable, default None + Return a subset of the columns. If array-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid `usecols` - parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter - results in much faster parsing time and lower memory usage. + inferred from the document header row(s). For example, a valid array-like + `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to True. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. as_recarray : boolean, default False DEPRECATED: this argument will be removed in a future version. Please call `pd.read_csv(...).to_records()` instead. @@ -977,17 +982,33 @@ def _is_index_col(col): return col is not None and col is not False +def _evaluate_usecols(usecols, names): + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return set([i for i, name in enumerate(names) + if usecols(name)]) + return usecols + + def _validate_usecols_arg(usecols): """ Check whether or not the 'usecols' parameter - contains all integers (column selection by index) - or strings (column by name). Raises a ValueError - if that is not the case. + contains all integers (column selection by index), + strings (column by name) or is a callable. Raises + a ValueError if that is not the case. 
""" - msg = ("The elements of 'usecols' must " - "either be all strings, all unicode, or all integers") + msg = ("'usecols' must either be all strings, all unicode, " + "all integers or a callable") if usecols is not None: + if callable(usecols): + return usecols usecols_dtype = lib.infer_dtype(usecols) if usecols_dtype not in ('empty', 'integer', 'string', 'unicode'): @@ -1499,11 +1520,12 @@ def __init__(self, src, **kwds): self.orig_names = self.names[:] if self.usecols: - if len(self.names) > len(self.usecols): + usecols = _evaluate_usecols(self.usecols, self.orig_names) + if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) - if (i in self.usecols or n in self.usecols)] + if (i in usecols or n in usecols)] - if len(self.names) < len(self.usecols): + if len(self.names) < len(usecols): raise ValueError("Usecols do not match names.") self._set_noconvert_columns() @@ -1665,9 +1687,10 @@ def read(self, nrows=None): def _filter_usecols(self, names): # hackish - if self.usecols is not None and len(names) != len(self.usecols): + usecols = _evaluate_usecols(self.usecols, names) + if usecols is not None and len(names) != len(usecols): names = [name for i, name in enumerate(names) - if i in self.usecols or name in self.usecols] + if i in usecols or name in usecols] return names def _get_index_names(self): @@ -2291,7 +2314,9 @@ def _handle_usecols(self, columns, usecols_key): usecols_key is used if there are string usecols. """ if self.usecols is not None: - if any([isinstance(col, string_types) for col in self.usecols]): + if callable(self.usecols): + col_indices = _evaluate_usecols(self.usecols, usecols_key) + elif any([isinstance(u, string_types) for u in self.usecols]): if len(columns) > 1: raise ValueError("If using multiple headers, usecols must " "be integers.") diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 5051171ccb8f0..26b4b5b8ec7d1 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -23,8 +23,9 @@ def test_raise_on_mixed_dtype_usecols(self): 1000,2000,3000 4000,5000,6000 """ - msg = ("The elements of 'usecols' must " - "either be all strings, all unicode, or all integers") + + msg = ("'usecols' must either be all strings, all unicode, " + "all integers or a callable") usecols = [0, 'b', 2] with tm.assertRaisesRegexp(ValueError, msg): @@ -302,8 +303,8 @@ def test_usecols_with_mixed_encoding_strings(self): 3.568935038,7,False,a ''' - msg = ("The elements of 'usecols' must " - "either be all strings, all unicode, or all integers") + msg = ("'usecols' must either be all strings, all unicode, " + "all integers or a callable") with tm.assertRaisesRegexp(ValueError, msg): self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) @@ -366,3 +367,31 @@ def test_np_array_usecols(self): expected = DataFrame([[1, 2]], columns=usecols) result = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) + + def test_callable_usecols(self): + # See gh-14154 + s = '''AaA,bBb,CCC,ddd + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'AaA': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'bBb': {0: 8, 1: 2, 2: 7}, + 'ddd': {0: 'a', 1: 'b', 2: 'a'} + } + expected = DataFrame(data) + df = self.read_csv(StringIO(s), usecols=lambda x: + x.upper() in ['AAA', 'BBB', 'DDD']) + tm.assert_frame_equal(df, expected) + + # Check that a callable returning only False returns + # an empty DataFrame + expected 
= DataFrame() + df = self.read_csv(StringIO(s), usecols=lambda x: False) + tm.assert_frame_equal(df, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 6760e822960f1..d94a4ef278dee 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -300,8 +300,9 @@ cdef class TextReader: object compression object mangle_dupe_cols object tupleize_cols + object usecols list dtype_cast_order - set noconvert, usecols + set noconvert def __cinit__(self, source, delimiter=b',', @@ -437,7 +438,10 @@ cdef class TextReader: # suboptimal if usecols is not None: self.has_usecols = 1 - self.usecols = set(usecols) + if callable(usecols): + self.usecols = usecols + else: + self.usecols = set(usecols) # XXX if skipfooter > 0: @@ -701,7 +705,6 @@ cdef class TextReader: cdef StringPath path = _string_path(self.c_encoding) header = [] - if self.parser.header_start >= 0: # Header is in the file @@ -821,7 +824,8 @@ cdef class TextReader: # 'data has %d fields' # % (passed_count, field_count)) - if self.has_usecols and self.allow_leading_cols: + if self.has_usecols and self.allow_leading_cols and \ + not callable(self.usecols): nuse = len(self.usecols) if nuse == passed_count: self.leading_cols = 0 @@ -1019,13 +1023,20 @@ cdef class TextReader: if i < self.leading_cols: # Pass through leading columns always name = i - elif self.usecols and nused == len(self.usecols): + elif self.usecols and not callable(self.usecols) and \ + nused == len(self.usecols): # Once we've gathered all requested columns, stop. GH5766 break else: name = self._get_column_name(i, nused) - if self.has_usecols and not (i in self.usecols or - name in self.usecols): + usecols = set() + if callable(self.usecols): + if self.usecols(name): + usecols = set([i]) + else: + usecols = self.usecols + if self.has_usecols and not (i in usecols or + name in usecols): continue nused += 1 From 0412732222df5d47ac337bd7b8e75764af56b898 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Dec 2016 08:00:30 -0500 Subject: [PATCH 133/183] COMPAT: numpy compat with 1-ndim object array compat and broadcasting (#14809) xref #14808 --- pandas/core/ops.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 96b447cda4bc4..7c5ad04cc90b0 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1176,6 +1176,13 @@ def na_op(x, y): yrav = y.ravel() mask = notnull(xrav) & notnull(yrav) xrav = xrav[mask] + + # we may need to manually + # broadcast a 1 element array + if yrav.shape != mask.shape: + yrav = np.empty(mask.shape, dtype=yrav.dtype) + yrav.fill(yrav.item()) + yrav = yrav[mask] if np.prod(xrav.shape) and np.prod(yrav.shape): with np.errstate(all='ignore'): From 51f725f7e817df964387b3b68bdf01a07e9fb8cc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Dec 2016 08:15:08 -0500 Subject: [PATCH 134/183] BUG: fix hash collisions for from int overflow (#14805) * BUG: we don't like hash collisions in siphash xref #14767 * This should be a 64-bit int, not an 8-bit int * fix tests --- pandas/src/hash.pyx | 5 ++-- pandas/tools/tests/test_hashing.py | 37 ++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/pandas/src/hash.pyx b/pandas/src/hash.pyx index a393e0df96954..06ed947808e39 100644 --- a/pandas/src/hash.pyx +++ b/pandas/src/hash.pyx @@ -40,7 +40,8 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): Py_ssize_t i, l, n ndarray[uint64_t] result bytes data, k - uint8_t *kb, *lens + uint8_t *kb + uint64_t *lens char **vecs, *cdata 
object val @@ -55,7 +56,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): # create an array of bytes vecs = malloc(n * sizeof(char *)) - lens = malloc(n * sizeof(uint8_t)) + lens = malloc(n * sizeof(uint64_t)) cdef list datas = [] for i in range(n): diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 4e05ae7007c80..6e5f30fb7a52d 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -142,7 +142,36 @@ def test_alternate_encoding(self): obj = Series(list('abc')) self.check_equal(obj, encoding='ascii') - def test_long_strings(self): - - obj = Index(tm.rands_array(nchars=10000, size=100)) - self.check_equal(obj) + def test_same_len_hash_collisions(self): + + for l in range(8): + length = 2**(l + 8) + 1 + s = tm.rands_array(length, 2) + result = hash_array(s, 'utf8') + self.assertFalse(result[0] == result[1]) + + for l in range(8): + length = 2**(l + 8) + s = tm.rands_array(length, 2) + result = hash_array(s, 'utf8') + self.assertFalse(result[0] == result[1]) + + def test_hash_collisions(self): + + # hash collisions are bad + # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 + L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9', # noqa + 'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'] # noqa + + # these should be different! + result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8') + expected1 = np.array([14963968704024874985], dtype=np.uint64) + self.assert_numpy_array_equal(result1, expected1) + + result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8') + expected2 = np.array([16428432627716348016], dtype=np.uint64) + self.assert_numpy_array_equal(result2, expected2) + + result = hash_array(np.asarray(L, dtype=object), 'utf8') + self.assert_numpy_array_equal( + result, np.concatenate([expected1, expected2], axis=0)) From 846e9e58cd0ad7c9c103d56e5e9fbbf318dea868 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 6 Dec 2016 13:43:13 -0500 Subject: [PATCH 135/183] MAINT: Cleanup pandas/src/parser (#14740) Remove dead code and reformat for style using Google's C++ style guide. Also adds Google's cpplint (fork) to the style checking for Travis. 
--- ci/lint.sh | 12 + pandas/src/parser/io.c | 118 +-- pandas/src/parser/io.h | 47 +- pandas/src/parser/tokenizer.c | 1831 ++++++++++++++------------------- pandas/src/parser/tokenizer.h | 166 ++- 5 files changed, 921 insertions(+), 1253 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index d6390a16b763e..7ab97bfc6d328 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -35,6 +35,18 @@ if [ "$LINT" ]; then done echo "Linting *.pxi.in DONE" + # readability/casting: Warnings about C casting instead of C++ casting + # runtime/int: Warnings about using C number types instead of C++ ones + # build/include_subdir: Warnings about prefacing included header files with directory + pip install cpplint + + echo "Linting *.c and *.h" + cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/parser + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting *.c and *.h DONE" + echo "Check for invalid testing" grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas if [ $? = "0" ]; then diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c index 566de72804968..562d6033ce3eb 100644 --- a/pandas/src/parser/io.c +++ b/pandas/src/parser/io.c @@ -1,12 +1,20 @@ -#include "io.h" +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ - /* - On-disk FILE, uncompressed - */ +#include "io.h" +/* + On-disk FILE, uncompressed +*/ void *new_file_source(char *fname, size_t buffer_size) { - file_source *fs = (file_source *) malloc(sizeof(file_source)); + file_source *fs = (file_source *)malloc(sizeof(file_source)); fs->fp = fopen(fname, "rb"); if (fs->fp == NULL) { @@ -18,7 +26,7 @@ void *new_file_source(char *fname, size_t buffer_size) { fs->initial_file_pos = ftell(fs->fp); // Only allocate this heap memory if we are not memory-mapping the file - fs->buffer = (char*) malloc((buffer_size + 1) * sizeof(char)); + fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { return NULL; @@ -27,25 +35,11 @@ void *new_file_source(char *fname, size_t buffer_size) { memset(fs->buffer, 0, buffer_size + 1); fs->buffer[buffer_size] = '\0'; - return (void *) fs; + return (void *)fs; } - -// XXX handle on systems without the capability - - -/* - * void *new_file_buffer(FILE *f, int buffer_size) - * - * Allocate a new file_buffer. - * Returns NULL if the memory allocation fails or if the call to mmap fails. - * - * buffer_size is ignored. 
- */ - - -void* new_rd_source(PyObject *obj) { - rd_source *rds = (rd_source *) malloc(sizeof(rd_source)); +void *new_rd_source(PyObject *obj) { + rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); /* hold on to this object */ Py_INCREF(obj); @@ -53,7 +47,7 @@ void* new_rd_source(PyObject *obj) { rds->buffer = NULL; rds->position = 0; - return (void*) rds; + return (void *)rds; } /* @@ -63,9 +57,7 @@ void* new_rd_source(PyObject *obj) { */ int del_file_source(void *fs) { - // fseek(FS(fs)->fp, FS(fs)->initial_file_pos, SEEK_SET); - if (fs == NULL) - return 0; + if (fs == NULL) return 0; /* allocated on the heap */ free(FS(fs)->buffer); @@ -89,13 +81,11 @@ int del_rd_source(void *rds) { */ - -void* buffer_file_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { +void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { file_source *src = FS(source); - *bytes_read = fread((void*) src->buffer, sizeof(char), nbytes, - src->fp); + *bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp); if (*bytes_read == 0) { *status = REACHED_EOF; @@ -103,13 +93,11 @@ void* buffer_file_bytes(void *source, size_t nbytes, *status = 0; } - return (void*) src->buffer; - + return (void *)src->buffer; } - -void* buffer_rd_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { +void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { PyGILState_STATE state; PyObject *result, *func, *args, *tmp; @@ -125,21 +113,18 @@ void* buffer_rd_bytes(void *source, size_t nbytes, args = Py_BuildValue("(i)", nbytes); func = PyObject_GetAttrString(src->obj, "read"); - /* printf("%s\n", PyBytes_AsString(PyObject_Repr(func))); */ /* TODO: does this release the GIL? */ result = PyObject_CallObject(func, args); Py_XDECREF(args); Py_XDECREF(func); - /* PyObject_Print(PyObject_Type(result), stdout, 0); */ if (result == NULL) { PyGILState_Release(state); *bytes_read = 0; *status = CALLING_READ_FAILED; return NULL; - } - else if (!PyBytes_Check(result)) { + } else if (!PyBytes_Check(result)) { tmp = PyUnicode_AsUTF8String(result); Py_XDECREF(result); result = tmp; @@ -154,8 +139,7 @@ void* buffer_rd_bytes(void *source, size_t nbytes, /* hang on to the Python object */ src->buffer = result; - retval = (void*) PyBytes_AsString(result); - + retval = (void *)PyBytes_AsString(result); PyGILState_Release(state); @@ -165,21 +149,18 @@ void* buffer_rd_bytes(void *source, size_t nbytes, return retval; } - #ifdef HAVE_MMAP -#include #include +#include -void *new_mmap(char *fname) -{ +void *new_mmap(char *fname) { struct stat buf; int fd; memory_map *mm; - /* off_t position; */ off_t filesize; - mm = (memory_map *) malloc(sizeof(memory_map)); + mm = (memory_map *)malloc(sizeof(memory_map)); mm->fp = fopen(fname, "rb"); fd = fileno(mm->fp); @@ -187,20 +168,19 @@ void *new_mmap(char *fname) fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); return NULL; } - filesize = buf.st_size; /* XXX This might be 32 bits. */ - + filesize = buf.st_size; /* XXX This might be 32 bits. */ if (mm == NULL) { /* XXX Eventually remove this print statement. 
*/ fprintf(stderr, "new_file_buffer: malloc() failed.\n"); return NULL; } - mm->size = (off_t) filesize; + mm->size = (off_t)filesize; mm->line_number = 0; mm->fileno = fd; mm->position = ftell(mm->fp); - mm->last_pos = (off_t) filesize; + mm->last_pos = (off_t)filesize; mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); if (mm->memmap == NULL) { @@ -210,30 +190,20 @@ void *new_mmap(char *fname) mm = NULL; } - return (void*) mm; + return (void *)mm; } - -int del_mmap(void *src) -{ +int del_mmap(void *src) { munmap(MM(src)->memmap, MM(src)->size); fclose(MM(src)->fp); - - /* - * With a memory mapped file, there is no need to do - * anything if restore == RESTORE_INITIAL. - */ - /* if (restore == RESTORE_FINAL) { */ - /* fseek(FB(fb)->file, FB(fb)->current_pos, SEEK_SET); */ - /* } */ free(src); return 0; } -void* buffer_mmap_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { +void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { void *retval; memory_map *src = MM(source); @@ -264,19 +234,15 @@ void* buffer_mmap_bytes(void *source, size_t nbytes, /* kludgy */ -void *new_mmap(char *fname) { - return NULL; -} +void *new_mmap(char *fname) { return NULL; } -int del_mmap(void *src) { - return 0; -} +int del_mmap(void *src) { return 0; } /* don't use this! */ -void* buffer_mmap_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) { - return NULL; +void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status) { + return NULL; } #endif diff --git a/pandas/src/parser/io.h b/pandas/src/parser/io.h index 2ae72ff8a7fe0..5a0c2b2b5e4a4 100644 --- a/pandas/src/parser/io.h +++ b/pandas/src/parser/io.h @@ -1,14 +1,23 @@ +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#ifndef PANDAS_SRC_PARSER_IO_H_ +#define PANDAS_SRC_PARSER_IO_H_ + #include "Python.h" #include "tokenizer.h" - typedef struct _file_source { /* The file being read. */ FILE *fp; char *buffer; - /* Size of the file, in bytes. */ - /* off_t size; */ /* file position when the file_buffer was created. */ off_t initial_file_pos; @@ -16,15 +25,9 @@ typedef struct _file_source { /* Offset in the file of the data currently in the buffer. */ off_t buffer_file_pos; - /* Actual number of bytes in the current buffer. (Can be less than buffer_size.) */ + /* Actual number of bytes in the current buffer. (Can be less than + * buffer_size.) */ off_t last_pos; - - /* Size (in bytes) of the buffer. */ - // off_t buffer_size; - - /* Pointer to the buffer. */ - // char *buffer; - } file_source; #define FS(source) ((file_source *)source) @@ -34,7 +37,6 @@ typedef struct _file_source { #endif typedef struct _memory_map { - FILE *fp; /* Size of the file, in bytes. 
*/ @@ -49,22 +51,20 @@ typedef struct _memory_map { off_t position; off_t last_pos; char *memmap; - } memory_map; -#define MM(src) ((memory_map*) src) +#define MM(src) ((memory_map *)src) void *new_mmap(char *fname); int del_mmap(void *src); -void* buffer_mmap_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status); - +void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status); typedef struct _rd_source { - PyObject* obj; - PyObject* buffer; + PyObject *obj; + PyObject *buffer; size_t position; } rd_source; @@ -77,9 +77,10 @@ void *new_rd_source(PyObject *obj); int del_file_source(void *src); int del_rd_source(void *src); -void* buffer_file_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status); +void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status); -void* buffer_rd_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status); +void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, + int *status); +#endif // PANDAS_SRC_PARSER_IO_H_ diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 450abcf6c325c..1ea62d66345bd 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -9,61 +9,33 @@ See LICENSE for the license */ - /* - Low-level ascii-file processing for pandas. Combines some elements from - Python's built-in csv module and Warren Weckesser's textreader project on - GitHub. See Python Software Foundation License and BSD licenses for these. +/* - */ +Low-level ascii-file processing for pandas. Combines some elements from +Python's built-in csv module and Warren Weckesser's textreader project on +GitHub. See Python Software Foundation License and BSD licenses for these. +*/ #include "tokenizer.h" #include -#include #include - - -//#define READ_ERROR_OUT_OF_MEMORY 1 - - -/* -* restore: -* RESTORE_NOT (0): -* Free memory, but leave the file position wherever it -* happend to be. -* RESTORE_INITIAL (1): -* Restore the file position to the location at which -* the file_buffer was created. -* RESTORE_FINAL (2): -* Put the file position at the next byte after the -* data read from the file_buffer. -* -#define RESTORE_NOT 0 -#define RESTORE_INITIAL 1 -#define RESTORE_FINAL 2 -*/ +#include static void *safe_realloc(void *buffer, size_t size) { void *result; - // OS X is weird + // OSX is weird. // http://stackoverflow.com/questions/9560609/ // different-realloc-behaviour-in-linux-and-osx result = realloc(buffer, size); - TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, result)) + TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, + result)) -/* if (result != NULL) { - // errno gets set to 12 on my OS Xmachine in some cases even when the - // realloc succeeds. 
annoying - errno = 0; - } else { - return buffer; - }*/ return result; } - void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { // column i, starting at 0 self->words = parser->words; @@ -73,7 +45,7 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { coliter_t *coliter_new(parser_t *self, int i) { // column i, starting at 0 - coliter_t *iter = (coliter_t*) malloc(sizeof(coliter_t)); + coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t)); if (NULL == iter) { return NULL; @@ -83,36 +55,28 @@ coliter_t *coliter_new(parser_t *self, int i) { return iter; } - - /* int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error); */ - /* uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); */ - - -static void free_if_not_null(void **ptr) { +static void free_if_not_null(void **ptr) { TRACE(("free_if_not_null %p\n", *ptr)) if (*ptr != NULL) { free(*ptr); *ptr = NULL; } - } - - - - /* +} - Parser / tokenizer +/* - */ + Parser / tokenizer +*/ -static void *grow_buffer(void *buffer, int length, int *capacity, - int space, int elsize, int *error) { +static void *grow_buffer(void *buffer, int length, int *capacity, int space, + int elsize, int *error) { int cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - while ( (length + space >= cap) && (newbuffer != NULL) ){ - cap = cap? cap << 1 : 2; + while ((length + space >= cap) && (newbuffer != NULL)) { + cap = cap ? cap << 1 : 2; buffer = newbuffer; newbuffer = safe_realloc(newbuffer, elsize * cap); } @@ -122,15 +86,14 @@ static void *grow_buffer(void *buffer, int length, int *capacity, // and return the last good realloc'd buffer so it can be freed *error = errno; newbuffer = buffer; - } else { + } else { // realloc worked, update *capacity and set *error to 0 // sigh, multiple return values *capacity = cap; *error = 0; } return newbuffer; - } - +} void parser_set_default_options(parser_t *self) { self->decimal = '.'; @@ -139,7 +102,7 @@ void parser_set_default_options(parser_t *self) { // For tokenization self->state = START_RECORD; - self->delimiter = ','; // XXX + self->delimiter = ','; // XXX self->delim_whitespace = 0; self->doublequote = 0; @@ -161,17 +124,13 @@ void parser_set_default_options(parser_t *self) { self->thousands = '\0'; self->skipset = NULL; - self-> skip_first_N_rows = -1; + self->skip_first_N_rows = -1; self->skip_footer = 0; } -int get_parser_memory_footprint(parser_t *self) { - return 0; -} +int get_parser_memory_footprint(parser_t *self) { return 0; } -parser_t* parser_new() { - return (parser_t*) calloc(1, sizeof(parser_t)); -} +parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); } int parser_clear_data_buffers(parser_t *self) { free_if_not_null((void *)&self->stream); @@ -183,14 +142,14 @@ int parser_clear_data_buffers(parser_t *self) { } int parser_cleanup(parser_t *self) { - int status = 0; + int status = 0; // XXX where to put this - free_if_not_null((void *) &self->error_msg); - free_if_not_null((void *) &self->warn_msg); + free_if_not_null((void *)&self->error_msg); + free_if_not_null((void *)&self->warn_msg); if (self->skipset != NULL) { - kh_destroy_int64((kh_int64_t*) self->skipset); + kh_destroy_int64((kh_int64_t *)self->skipset); self->skipset = NULL; } @@ -207,8 +166,6 @@ int parser_cleanup(parser_t *self) { return status; } - - int parser_init(parser_t *self) { int sz; @@ -225,7 +182,7 @@ int parser_init(parser_t *self) { 
self->warn_msg = NULL; // token stream - self->stream = (char*) malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = (char *)malloc(STREAM_INIT_SIZE * sizeof(char)); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -235,16 +192,16 @@ int parser_init(parser_t *self) { // word pointers and metadata sz = STREAM_INIT_SIZE / 10; - sz = sz? sz : 1; - self->words = (char**) malloc(sz * sizeof(char*)); - self->word_starts = (int*) malloc(sz * sizeof(int)); + sz = sz ? sz : 1; + self->words = (char **)malloc(sz * sizeof(char *)); + self->word_starts = (int *)malloc(sz * sizeof(int)); self->words_cap = sz; self->words_len = 0; // line pointers and metadata - self->line_start = (int*) malloc(sz * sizeof(int)); + self->line_start = (int *)malloc(sz * sizeof(int)); - self->line_fields = (int*) malloc(sz * sizeof(int)); + self->line_fields = (int *)malloc(sz * sizeof(int)); self->lines_cap = sz; self->lines = 0; @@ -253,7 +210,6 @@ int parser_init(parser_t *self) { if (self->stream == NULL || self->words == NULL || self->word_starts == NULL || self->line_start == NULL || self->line_fields == NULL) { - parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -279,7 +235,6 @@ int parser_init(parser_t *self) { return 0; } - void parser_free(parser_t *self) { // opposite of parser_init parser_cleanup(self); @@ -292,20 +247,21 @@ static int make_stream_space(parser_t *self, size_t nbytes) { // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - /* TRACE(("maybe growing buffers\n")); */ - /* TOKEN STREAM */ - orig_ptr = (void *) self->stream; - TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) - self->stream = (char*) grow_buffer((void *) self->stream, - self->stream_len, - &self->stream_cap, nbytes * 2, - sizeof(char), &status); - TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", - self->stream, self->stream_len, self->stream_cap, status)) + orig_ptr = (void *)self->stream; + TRACE( + ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", + nbytes)) + self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, + sizeof(char), &status); + TRACE( + ("make_stream_space: self->stream=%p, self->stream_len = %zu, " + "self->stream_cap=%zu, status=%zu\n", + self->stream, self->stream_len, self->stream_cap, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -313,95 +269,86 @@ static int make_stream_space(parser_t *self, size_t nbytes) { // realloc sets errno when moving buffer? 
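    /*
     * Descriptive note on the block below: if grow_buffer()/safe_realloc()
     * moved the stream allocation, every entry in self->words still points
     * into the old block. pword_start and each words[i] are therefore
     * rebased onto the new self->stream using the saved word_starts
     * offsets; without this the word pointers would dangle.
     */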
if (self->stream != orig_ptr) { - // uff - /* TRACE(("Moving word pointers\n")) */ - self->pword_start = self->stream + self->word_start; - for (i = 0; i < self->words_len; ++i) - { + for (i = 0; i < self->words_len; ++i) { self->words[i] = self->stream + self->word_starts[i]; } } - /* WORD VECTORS */ cap = self->words_cap; - self->words = (char**) grow_buffer((void *) self->words, - self->words_len, - &self->words_cap, nbytes, - sizeof(char*), &status); - TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", - self->words_len, self->words_cap, nbytes, status)) + self->words = + (char **)grow_buffer((void *)self->words, self->words_len, + &self->words_cap, nbytes, sizeof(char *), &status); + TRACE( + ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " + "%d)\n", + self->words_len, self->words_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } - // realloc took place if (cap != self->words_cap) { - TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, self->words_cap=%d\n", nbytes, self->words_cap)) - newptr = safe_realloc((void *) self->word_starts, sizeof(int) * self->words_cap); + TRACE( + ("make_stream_space: cap != self->words_cap, nbytes = %d, " + "self->words_cap=%d\n", + nbytes, self->words_cap)) + newptr = safe_realloc((void *)self->word_starts, + sizeof(int) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int*) newptr; + self->word_starts = (int *)newptr; } } - /* LINE VECTORS */ - /* - printf("Line_start: "); - - for (j = 0; j < self->lines + 1; ++j) { - printf("%d ", self->line_fields[j]); - } - printf("\n"); - - printf("lines_cap: %d\n", self->lines_cap); - */ cap = self->lines_cap; - self->line_start = (int*) grow_buffer((void *) self->line_start, - self->lines + 1, - &self->lines_cap, nbytes, - sizeof(int), &status); - TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", - self->lines + 1, self->lines_cap, nbytes, status)) + self->line_start = + (int *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, sizeof(int), &status); + TRACE(( + "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + self->lines + 1, self->lines_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } // realloc took place if (cap != self->lines_cap) { - TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) - newptr = safe_realloc((void *) self->line_fields, sizeof(int) * self->lines_cap); + TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", + nbytes)) + newptr = safe_realloc((void *)self->line_fields, + sizeof(int) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int*) newptr; + self->line_fields = (int *)newptr; } } - /* TRACE(("finished growing buffers\n")); */ - return 0; } - static int push_char(parser_t *self, char c) { - /* TRACE(("pushing %c \n", c)) */ - TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) + TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", + self->stream_len + 1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { - TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", - self->stream_len, self->stream_cap)) - self->error_msg = (char*) malloc(64); - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); + TRACE( + ("push_char: ERROR!!! 
self->stream_len(%d) >= " + "self->stream_cap(%d)\n", + self->stream_len, self->stream_cap)) + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; } self->stream[self->stream_len++] = c; @@ -410,11 +357,15 @@ static int push_char(parser_t *self, char c) { int P_INLINE end_field(parser_t *self) { // XXX cruft -// self->numeric_field = 0; if (self->words_len >= self->words_cap) { - TRACE(("end_field: ERROR!!! self->words_len(%zu) >= self->words_cap(%zu)\n", self->words_len, self->words_cap)) - self->error_msg = (char*) malloc(64); - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); + TRACE( + ("end_field: ERROR!!! self->words_len(%zu) >= " + "self->words_cap(%zu)\n", + self->words_len, self->words_cap)) + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; } @@ -426,8 +377,8 @@ int P_INLINE end_field(parser_t *self) { TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); - TRACE(("end_field: Saw word %s at: %d. Total: %d\n", - self->pword_start, self->word_start, self->words_len + 1)) + TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start, + self->word_start, self->words_len + 1)) self->word_starts[self->words_len] = self->word_start; self->words_len++; @@ -442,29 +393,29 @@ int P_INLINE end_field(parser_t *self) { return 0; } - static void append_warning(parser_t *self, const char *msg) { int ex_length; int length = strlen(msg); void *newptr; if (self->warn_msg == NULL) { - self->warn_msg = (char*) malloc(length + 1); - strcpy(self->warn_msg, msg); + self->warn_msg = (char *)malloc(length + 1); + strncpy(self->warn_msg, msg, strlen(msg) + 1); } else { ex_length = strlen(self->warn_msg); newptr = safe_realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { - self->warn_msg = (char*) newptr; - strcpy(self->warn_msg + ex_length, msg); + self->warn_msg = (char *)newptr; + strncpy(self->warn_msg + ex_length, msg, strlen(msg) + 1); } } } static int end_line(parser_t *self) { + char *msg; int fields; int ex_fields = self->expected_fields; - char *msg; + int bufsize = 100; // for error or warning messages fields = self->line_fields[self->lines]; @@ -478,11 +429,10 @@ static int end_line(parser_t *self) { } } - if (self->state == START_FIELD_IN_SKIP_LINE || \ - self->state == IN_FIELD_IN_SKIP_LINE || \ - self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \ - self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE - ) { + if (self->state == START_FIELD_IN_SKIP_LINE || + self->state == IN_FIELD_IN_SKIP_LINE || + self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || + self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; @@ -495,9 +445,8 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= self->header_end + 1) - && (self->expected_fields < 0 && fields > ex_fields) - && !(self->usecols)) { + if (!(self->lines <= self->header_end + 1) && + (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -509,8 +458,9 @@ static int end_line(parser_t *self) { // file_lines is now the actual file line number (starting at 1) if (self->error_bad_lines) { - self->error_msg 
= (char*) malloc(100); - sprintf(self->error_msg, "Expected %d fields in line %d, saw %d\n", + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Expected %d fields in line %d, saw %d\n", ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -520,9 +470,10 @@ static int end_line(parser_t *self) { // simply skip bad lines if (self->warn_bad_lines) { // pass up error message - msg = (char*) malloc(100); - sprintf(msg, "Skipping line %d: expected %d fields, saw %d\n", - self->file_lines, ex_fields, fields); + msg = (char *)malloc(bufsize); + snprintf(msg, bufsize, + "Skipping line %d: expected %d fields, saw %d\n", + self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } @@ -530,14 +481,13 @@ static int end_line(parser_t *self) { } else { // missing trailing delimiters if ((self->lines >= self->header_end + 1) && fields < ex_fields) { - // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { self->error_msg = "out of memory"; return -1; } - while (fields < ex_fields){ + while (fields < ex_fields) { end_field(self); fields++; } @@ -549,15 +499,21 @@ static int end_line(parser_t *self) { // good line, set new start point if (self->lines >= self->lines_cap) { - TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) \ - self->error_msg = (char*) malloc(100); \ - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); \ - return PARSER_OUT_OF_MEMORY; \ + TRACE(( + "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", + self->lines, self->lines_cap)) + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - " + "possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; } - self->line_start[self->lines] = (self->line_start[self->lines - 1] + - fields); + self->line_start[self->lines] = + (self->line_start[self->lines - 1] + fields); - TRACE(("end_line: new line start: %d\n", self->line_start[self->lines])); + TRACE( + ("end_line: new line start: %d\n", self->line_start[self->lines])); // new line start with 0 fields self->line_fields[self->lines] = 0; @@ -574,10 +530,10 @@ int parser_add_skiprow(parser_t *self, int64_t row) { int ret = 0; if (self->skipset == NULL) { - self->skipset = (void*) kh_init_int64(); + self->skipset = (void *)kh_init_int64(); } - set = (kh_int64_t*) self->skipset; + set = (kh_int64_t *)self->skipset; k = kh_put_int64(set, row, &ret); set->keys[k] = row; @@ -601,18 +557,21 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { status = 0; self->datapos = 0; self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); - TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", - nbytes, bytes_read, status)); + TRACE(( + "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + nbytes, bytes_read, status)); self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { - self->error_msg = (char*) malloc(200); + int bufsize = 200; + self->error_msg = (char *)malloc(bufsize); if (status == CALLING_READ_FAILED) { - sprintf(self->error_msg, ("Calling read(nbytes) on source failed. " - "Try engine='python'.")); + snprintf(self->error_msg, bufsize, + "Calling read(nbytes) on source failed. 
" + "Try engine='python'."); } else { - sprintf(self->error_msg, "Unknown error in IO callback"); + snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); } return -1; } @@ -622,93 +581,96 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { return status; } - /* Tokenization macros and state machine code */ -// printf("pushing %c\n", c); - -#define PUSH_CHAR(c) \ - TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= maxstreamsize) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, maxstreamsize)) \ - self->error_msg = (char*) malloc(100); \ - sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); \ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ +#define PUSH_CHAR(c) \ + TRACE( \ + ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= maxstreamsize) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, \ + maxstreamsize)) \ + int bufsize = 100; \ + self->error_msg = (char *)malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n");\ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ slen++; // This is a little bit of a hack but works for now -#define END_FIELD() \ - self->stream_len = slen; \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ +#define END_FIELD() \ + self->stream_len = slen; \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ slen = self->stream_len; -#define END_LINE_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - \ - } - -#define END_LINE_AND_FIELD_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - \ +#define END_LINE_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + } + +#define END_LINE_AND_FIELD_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ } #define END_LINE() END_LINE_STATE(START_RECORD) #define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) -#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \ - (self->lineterminator != '\0' && \ - c == self->lineterminator)) +#define IS_TERMINATOR(c) \ + ((self->lineterminator == '\0' && c == '\n') || \ + (self->lineterminator != '\0' && c == self->lineterminator)) #define 
IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) // don't parse '\r' with a custom line terminator #define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r')) -#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar)) +#define IS_COMMENT_CHAR(c) \ + ((self->commentchar != '\0' && c == self->commentchar)) #define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar)) -#define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \ - self->skipinitialspace)) +#define IS_SKIPPABLE_SPACE(c) \ + ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) ((!self->delim_whitespace && c == self->delimiter) || \ - (self->delim_whitespace && IS_WHITESPACE(c))) +#define IS_DELIMITER(c) \ + ((!self->delim_whitespace && c == self->delimiter) || \ + (self->delim_whitespace && IS_WHITESPACE(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ self->datapos = i; \ - TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); + TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ + self->datalen)); #define CHECK_FOR_BOM() \ if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ @@ -718,16 +680,14 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipset != NULL) { - return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != - ((kh_int64_t*)self->skipset)->n_buckets ); - } - else { - return ( rownum <= self->skip_first_N_rows ); + return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != + ((kh_int64_t *)self->skipset)->n_buckets); + } else { + return (rownum <= self->skip_first_N_rows); } } -int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) -{ +int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { int i, slen; long maxstreamsize; char c; @@ -749,368 +709,364 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) CHECK_FOR_BOM(); } - for (i = self->datapos; i < self->datalen; ++i) - { + for (i = self->datapos; i < self->datalen; ++i) { // next character in file c = *buf++; - TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch(self->state) { - - case START_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_DELIMITER(c)) { - // Do nothing, we're starting a new field again. 
- } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; + TRACE( + ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " + "state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); - case IN_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } - break; - - case IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - if (self->doublequote) { - self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; + switch (self->state) { + case START_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_DELIMITER(c)) { + // Do nothing, we're starting a new field again. } else { self->state = IN_FIELD_IN_SKIP_LINE; } - } - break; - - case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case WHITESPACE_LINE: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; break; - } else if (!self->delim_whitespace) { - if (IS_WHITESPACE(c) && c != self->delimiter) { - ; - } else { // backtrack - // use i + 1 because buf has been incremented but not i - do { - --buf; - --i; - } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); - // reached a newline rather than the beginning - if (IS_TERMINATOR(*buf)) { - ++buf; // move pointer to first char after newline - ++i; - } - self->state = START_FIELD; + case IN_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } break; - } - // fall through - case EAT_WHITESPACE: - if (IS_TERMINATOR(c)) { - END_LINE(); - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - break; - } else if (!IS_WHITESPACE(c)) { - self->state = START_FIELD; - // fall through to subsequent state - } else { - // if whitespace char, keep slurping + case IN_QUOTED_FIELD_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + if (self->doublequote) { + self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + } break; - } - case START_RECORD: - // start of record - if (skip_this_line(self, self->file_lines)) { + case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } else { self->state = IN_FIELD_IN_SKIP_LINE; - - if (IS_TERMINATOR(c)) { - END_LINE(); - } } break; - } else if (IS_TERMINATOR(c)) { - // \n\r possible? 
- if (self->skip_empty_lines) { + + case WHITESPACE_LINE: + if (IS_TERMINATOR(c)) { self->file_lines++; - } else { - END_LINE(); - } - break; - } else if (IS_CARRIAGE(c)) { - if (self->skip_empty_lines) { + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; - } else { + break; + } else if (!self->delim_whitespace) { + if (IS_WHITESPACE(c) && c != self->delimiter) { + } else { // backtrack + // use i + 1 because buf has been incremented but not i + do { + --buf; + --i; + } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); + + // reached a newline rather than the beginning + if (IS_TERMINATOR(*buf)) { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + } + // fall through + + case EAT_WHITESPACE: + if (IS_TERMINATOR(c)) { + END_LINE(); + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; + break; + } else if (!IS_WHITESPACE(c)) { + self->state = START_FIELD; + // fall through to subsequent state + } else { + // if whitespace char, keep slurping + break; } - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_LINE_COMMENT; - break; - } else if (IS_WHITESPACE(c)) { - if (self->delim_whitespace) { + + case START_RECORD: + // start of record + if (skip_this_line(self, self->file_lines)) { + if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + + if (IS_TERMINATOR(c)) { + END_LINE(); + } + } + break; + } else if (IS_TERMINATOR(c)) { + // \n\r possible? if (self->skip_empty_lines) { - self->state = WHITESPACE_LINE; + self->file_lines++; } else { - self->state = EAT_WHITESPACE; + END_LINE(); } break; - } else if (c != self->delimiter && self->skip_empty_lines) { - self->state = WHITESPACE_LINE; + } else if (IS_CARRIAGE(c)) { + if (self->skip_empty_lines) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else { + self->state = EAT_CRNL; + } + break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_LINE_COMMENT; break; + } else if (IS_WHITESPACE(c)) { + if (self->delim_whitespace) { + if (self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + } else { + self->state = EAT_WHITESPACE; + } + break; + } else if (c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; + } + // fall through } - // fall through - } - // normal character - fall through - // to handle as START_FIELD - self->state = START_FIELD; + // normal character - fall through + // to handle as START_FIELD + self->state = START_FIELD; - case START_FIELD: - // expecting field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_QUOTE(c)) { - // start quoted field - self->state = IN_QUOTED_FIELD; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_SKIPPABLE_SPACE(c)) { - // ignore space at start of field - ; - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - // save empty field + case START_FIELD: + // expecting field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_QUOTE(c)) { + // start quoted field + self->state = IN_QUOTED_FIELD; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if 
(IS_SKIPPABLE_SPACE(c)) { + // ignore space at start of field + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + // save empty field + END_FIELD(); + } + } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); + self->state = EAT_COMMENT; + } else { + // begin new unquoted field + PUSH_CHAR(c); + self->state = IN_FIELD; } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // begin new unquoted field - // if (self->delim_whitespace && \ - // self->quoting == QUOTE_NONNUMERIC) { - // self->numeric_field = 1; - // } + break; + case ESCAPED_CHAR: PUSH_CHAR(c); self->state = IN_FIELD; - } - break; + break; - case ESCAPED_CHAR: - PUSH_CHAR(c); - self->state = IN_FIELD; - break; + case EAT_LINE_COMMENT: + if (IS_TERMINATOR(c)) { + self->file_lines++; + self->state = START_RECORD; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; - case EAT_LINE_COMMENT: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; + case IN_FIELD: + // in unquoted field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); - case IN_FIELD: - // in unquoted field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_COMMENT_CHAR(c)) { + END_FIELD(); + self->state = EAT_COMMENT; } else { - self->state = START_FIELD; + // normal character - save in field + PUSH_CHAR(c); } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; + break; - case IN_QUOTED_FIELD: - // in quoted field - if (IS_ESCAPE_CHAR(c)) { - // possible escape character - self->state = ESCAPE_IN_QUOTED_FIELD; - } else if (IS_QUOTE(c)) { - if (self->doublequote) { - // double quote - " represented by "" - self->state = QUOTE_IN_QUOTED_FIELD; + case IN_QUOTED_FIELD: + // in quoted field + if (IS_ESCAPE_CHAR(c)) { + // possible escape character + self->state = ESCAPE_IN_QUOTED_FIELD; + } else if (IS_QUOTE(c)) { + if (self->doublequote) { + // double quote - " represented by "" + self->state = QUOTE_IN_QUOTED_FIELD; + } else { + // end of quote part of field + self->state = IN_FIELD; + } } else { - // end of quote part of field - self->state = IN_FIELD; + // normal character - save in field + PUSH_CHAR(c); } - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - // double quote - seen a quote in an quoted field - if (IS_QUOTE(c)) { - // save "" as " + break; + case ESCAPE_IN_QUOTED_FIELD: PUSH_CHAR(c); self->state = IN_QUOTED_FIELD; - } else if (IS_DELIMITER(c)) 
{ - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (!self->strict) { - PUSH_CHAR(c); - self->state = IN_FIELD; - } else { - self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, - "delimiter expected after " - "quote in quote"); - goto parsingerror; - } - break; + break; - case EAT_COMMENT: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - } - break; + case QUOTE_IN_QUOTED_FIELD: + // double quote - seen a quote in an quoted field + if (IS_QUOTE(c)) { + // save "" as " - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - } else if (IS_DELIMITER(c)){ + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); - if (self->delim_whitespace) { - END_LINE_STATE(EAT_WHITESPACE); + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (!self->strict) { + PUSH_CHAR(c); + self->state = IN_FIELD; } else { - // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(START_FIELD); + int bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "delimiter expected after quote in quote"); + goto parsingerror; } - } else { - if (self->delim_whitespace) { - /* XXX - * first character of a new record--need to back up and reread - * to handle properly... - */ - i--; buf--; // back up one character (HACK!) - END_LINE_STATE(START_RECORD); - } else { - // \r line terminator - // UGH. we don't actually want - // to consume the token. fix this later - self->stream_len = slen; - if (end_line(self) < 0) { - goto parsingerror; - } + break; - stream = self->stream + self->stream_len; - slen = self->stream_len; - self->state = START_RECORD; + case EAT_COMMENT: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + } + break; + + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL: + if (c == '\n') { + END_LINE(); + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + END_LINE_STATE(EAT_WHITESPACE); + } else { + // Handle \r-delimited files + END_LINE_AND_FIELD_STATE(START_FIELD); + } + } else { + if (self->delim_whitespace) { + /* XXX + * first character of a new record--need to back up and + * reread + * to handle properly... + */ + i--; + buf--; // back up one character (HACK!) + END_LINE_STATE(START_RECORD); + } else { + // \r line terminator + // UGH. we don't actually want + // to consume the token. fix this later + self->stream_len = slen; + if (end_line(self) < 0) { + goto parsingerror; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + self->state = START_RECORD; - --i; buf--; // let's try this character again (HACK!) - if (line_limit > 0 && self->lines == start_lines + line_limit) { - goto linelimit; + --i; + buf--; // let's try this character again (HACK!) 
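                        /*
                         * Note on the check below: i and buf were just
                         * stepped back so this character is re-examined in
                         * START_RECORD on the next iteration; once the
                         * caller's requested row count is reached, control
                         * bails out through the linelimit label.
                         */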
+ if (line_limit > 0 && + self->lines == start_lines + line_limit) { + goto linelimit; + } } } - } - break; + break; - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL_NOP: // inside an ignored comment line - self->state = START_RECORD; - // \r line terminator -- parse this character again - if (c != '\n' && !IS_DELIMITER(c)) { - --i; - --buf; - } - break; - default: - break; + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL_NOP: // inside an ignored comment line + self->state = START_RECORD; + // \r line terminator -- parse this character again + if (c != '\n' && !IS_DELIMITER(c)) { + --i; + --buf; + } + break; + default: + break; } } @@ -1134,39 +1090,41 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) } static int parser_handle_eof(parser_t *self) { - TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) + int bufsize = 100; - if (self->datalen != 0) - return -1; + TRACE( + ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) - switch (self->state) { - case START_RECORD: - case WHITESPACE_LINE: - case EAT_CRNL_NOP: - case EAT_LINE_COMMENT: - return 0; + if (self->datalen != 0) return -1; - case ESCAPE_IN_QUOTED_FIELD: - case IN_QUOTED_FIELD: - self->error_msg = (char*)malloc(100); - sprintf(self->error_msg, "EOF inside string starting at line %d", - self->file_lines); - return -1; + switch (self->state) { + case START_RECORD: + case WHITESPACE_LINE: + case EAT_CRNL_NOP: + case EAT_LINE_COMMENT: + return 0; - case ESCAPED_CHAR: - self->error_msg = (char*)malloc(100); - sprintf(self->error_msg, "EOF following escape character"); - return -1; + case ESCAPE_IN_QUOTED_FIELD: + case IN_QUOTED_FIELD: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "EOF inside string starting at line %d", self->file_lines); + return -1; - case IN_FIELD: - case START_FIELD: - case QUOTE_IN_QUOTED_FIELD: - if (end_field(self) < 0) + case ESCAPED_CHAR: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "EOF following escape character"); return -1; - break; - default: - break; + case IN_FIELD: + case START_FIELD: + case QUOTE_IN_QUOTED_FIELD: + if (end_field(self) < 0) return -1; + break; + + default: + break; } if (end_line(self) < 0) @@ -1183,19 +1141,19 @@ int parser_consume_rows(parser_t *self, size_t nrows) { } /* do nothing */ - if (nrows == 0) - return 0; + if (nrows == 0) return 0; /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; char_count = (self->word_starts[word_deletions - 1] + strlen(self->words[word_deletions - 1]) + 1); - TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); + TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, + char_count)); /* move stream, only if something to move */ if (char_count < self->stream_len) { - memmove((void*) self->stream, (void*) (self->stream + char_count), + memmove((void *)self->stream, (void *)(self->stream + char_count), self->stream_len - char_count); } /* buffer counts */ @@ -1213,26 +1171,14 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* move current word pointer to stream */ self->pword_start -= char_count; self->word_start -= char_count; - /* - printf("Line_start: "); - for (i = 0; i < self->lines + 1; ++i) { - printf("%d ", 
self->line_fields[i]); - } - printf("\n"); - */ + /* move line metadata */ - for (i = 0; i < self->lines - nrows + 1; ++i) - { + for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; - - /* TRACE(("First word in line %d is now %s\n", i, */ - /* self->words[self->line_start[i]])); */ - self->line_fields[i] = self->line_fields[offset]; } self->lines -= nrows; - /* self->line_fields[self->lines] = 0; */ return 0; } @@ -1256,47 +1202,50 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - newptr = safe_realloc((void*) self->words, new_cap * sizeof(char*)); + newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->words = (char**) newptr; + self->words = (char **)newptr; } - newptr = safe_realloc((void*) self->word_starts, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->word_starts, new_cap * sizeof(int)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int*) newptr; + self->word_starts = (int *)newptr; self->words_cap = new_cap; } } /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", - new_cap, self->stream_cap, self->lines_cap)); + TRACE( + ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " + "%zu\n", + new_cap, self->stream_cap, self->lines_cap)); if (new_cap < self->stream_cap) { - TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); - newptr = safe_realloc((void*) self->stream, new_cap); + TRACE( + ("parser_trim_buffers: new_cap < self->stream_cap, calling " + "safe_realloc\n")); + newptr = safe_realloc((void *)self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - // Update the pointers in the self->words array (char **) if `safe_realloc` - // moved the `self->stream` buffer. This block mirrors a similar block in + // Update the pointers in the self->words array (char **) if + // `safe_realloc` + // moved the `self->stream` buffer. This block mirrors a similar + // block in // `make_stream_space`. 
if (self->stream != newptr) { - /* TRACE(("Moving word pointers\n")) */ - self->pword_start = (char*) newptr + self->word_start; + self->pword_start = (char *)newptr + self->word_start; - for (i = 0; i < self->words_len; ++i) - { - self->words[i] = (char*) newptr + self->word_starts[i]; + for (i = 0; i < self->words_len; ++i) { + self->words[i] = (char *)newptr + self->word_starts[i]; } } self->stream = newptr; self->stream_cap = new_cap; - } } @@ -1304,17 +1253,17 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = safe_realloc((void*) self->line_start, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_start, new_cap * sizeof(int)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_start = (int*) newptr; + self->line_start = (int *)newptr; } - newptr = safe_realloc((void*) self->line_fields, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_fields, new_cap * sizeof(int)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int*) newptr; + self->line_fields = (int *)newptr; self->lines_cap = new_cap; } } @@ -1326,12 +1275,10 @@ void debug_print_parser(parser_t *self) { int j, line; char *token; - for (line = 0; line < self->lines; ++line) - { + for (line = 0; line < self->lines; ++line) { printf("(Parsed) Line %d: ", line); - for (j = 0; j < self->line_fields[j]; ++j) - { + for (j = 0; j < self->line_fields[j]; ++j) { token = self->words[j + self->line_start[line]]; printf("%s ", token); } @@ -1339,13 +1286,6 @@ void debug_print_parser(parser_t *self) { } } -/*int clear_parsed_lines(parser_t *self, size_t nlines) { - // TODO. move data up in stream, shift relevant word pointers - - return 0; -}*/ - - /* nrows : number of rows to tokenize (or until reach EOF) all : tokenize all the data vs. 
certain number of rows @@ -1359,12 +1299,12 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { return 0; } - TRACE(("_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", \ - (int) nrows, self->datapos, self->datalen)); + TRACE(( + "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", + (int)nrows, self->datapos, self->datalen)); while (1) { - if (!all && self->lines - start_lines >= nrows) - break; + if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { status = parser_buffer_bytes(self, self->chunksize); @@ -1379,15 +1319,19 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { } } - TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", - self->datalen - self->datapos, self->datalen, self->datapos)); + TRACE( + ("_tokenize_helper: Trying to process %d bytes, datalen=%d, " + "datapos= %d\n", + self->datalen - self->datapos, self->datalen, self->datapos)); status = tokenize_bytes(self, nrows, start_lines); if (status < 0) { // XXX - TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, breaking\n", - status)); + TRACE( + ("_tokenize_helper: Status %d returned from tokenize_bytes, " + "breaking\n", + status)); status = -1; break; } @@ -1406,86 +1350,11 @@ int tokenize_all_rows(parser_t *self) { return status; } -/* SEL - does not look like this routine is used anywhere -void test_count_lines(char *fname) { - clock_t start = clock(); - - char *buffer, *tmp; - size_t bytes, lines = 0; - int i; - FILE *fp = fopen(fname, "rb"); - - buffer = (char*) malloc(CHUNKSIZE * sizeof(char)); - - while(1) { - tmp = buffer; - bytes = fread((void *) buffer, sizeof(char), CHUNKSIZE, fp); - // printf("Read %d bytes\n", bytes); - - if (bytes == 0) { - break; - } - - for (i = 0; i < bytes; ++i) - { - if (*tmp++ == '\n') { - lines++; - } - } - } - - - printf("Saw %d lines\n", (int) lines); - - free(buffer); - fclose(fp); - - printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC); -}*/ - - P_INLINE void uppercase(char *p) { - for ( ; *p; ++p) *p = toupper(*p); -} - -/* SEL - does not look like these routines are used anywhere -P_INLINE void lowercase(char *p) { - for ( ; *p; ++p) *p = tolower(*p); + for (; *p; ++p) *p = toupper(*p); } -int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal) -{ - char *p_end; - - *p_real = xstrtod(item, &p_end, decimal, sci, '\0', FALSE); - if (*p_end == '\0') { - *p_imag = 0.0; - return errno == 0; - } - if (*p_end == 'i' || *p_end == 'j') { - *p_imag = *p_real; - *p_real = 0.0; - ++p_end; - } - else { - if (*p_end == '+') { - ++p_end; - } - *p_imag = xstrtod(p_end, &p_end, decimal, sci, '\0', FALSE); - if (errno || ((*p_end != 'i') && (*p_end != 'j'))) { - return FALSE; - } - ++p_end; - } - while(*p_end == ' ') { - ++p_end; - } - return *p_end == '\0'; -}*/ - - -int P_INLINE to_longlong(char *item, long long *p_value) -{ +int P_INLINE to_longlong(char *item, long long *p_value) { char *p_end; // Try integer conversion. We explicitly give the base to be 10. 
If @@ -1500,65 +1369,26 @@ int P_INLINE to_longlong(char *item, long long *p_value) return (errno == 0) && (!*p_end); } -/* does not look like this routine is used anywhere -int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep) -{ - int i, pos, status, n = strlen(item), count = 0; - char *tmp; - char *p_end; - - for (i = 0; i < n; ++i) - { - if (*(item + i) == tsep) { - count++; - } - } - - if (count == 0) { - return to_longlong(item, p_value); - } - - tmp = (char*) malloc((n - count + 1) * sizeof(char)); - if (tmp == NULL) { - return 0; - } - - pos = 0; - for (i = 0; i < n; ++i) - { - if (item[i] != tsep) - tmp[pos++] = item[i]; - } - - tmp[pos] = '\0'; - - status = to_longlong(tmp, p_value); - free(tmp); - - return status; -}*/ - int to_boolean(const char *item, uint8_t *val) { char *tmp; int i, status = 0; + int bufsize = sizeof(char) * (strlen(item) + 1); static const char *tstrs[1] = {"TRUE"}; static const char *fstrs[1] = {"FALSE"}; - tmp = malloc(sizeof(char) * (strlen(item) + 1)); - strcpy(tmp, item); + tmp = malloc(bufsize); + strncpy(tmp, item, bufsize); uppercase(tmp); - for (i = 0; i < 1; ++i) - { + for (i = 0; i < 1; ++i) { if (strcmp(tmp, tstrs[i]) == 0) { *val = 1; goto done; } } - for (i = 0; i < 1; ++i) - { + for (i = 0; i < 1; ++i) { if (strcmp(tmp, fstrs[i]) == 0) { *val = 0; goto done; @@ -1572,27 +1402,19 @@ int to_boolean(const char *item, uint8_t *val) { return status; } -// #define TEST - #ifdef TEST -int main(int argc, char *argv[]) -{ +int main(int argc, char *argv[]) { double x, y; long long xi; int status; char *s; - //s = "0.10e-3-+5.5e2i"; - // s = "1-0j"; - // status = to_complex(s, &x, &y, 'e', '.'); s = "123,789"; status = to_longlong_thousands(s, &xi, ','); printf("s = '%s'\n", s); printf("status = %d\n", status); - printf("x = %d\n", (int) xi); - - // printf("x = %lg, y = %lg\n", x, y); + printf("x = %d\n", (int)xi); return 0; } @@ -1621,10 +1443,12 @@ int main(int argc, char *argv[]) // may be used to endorse or promote products derived from this software // without specific prior written permission. // -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -1643,197 +1467,185 @@ int main(int argc, char *argv[]) // * Add tsep argument for thousands separator // -double xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing) -{ - double number; - int exponent; - int negative; - char *p = (char *) str; - double p10; - int n; - int num_digits; - int num_decimals; - - errno = 0; - - // Skip leading whitespace - while (isspace(*p)) p++; - - // Handle optional sign - negative = 0; - switch (*p) - { - case '-': negative = 1; // Fall through to increment position - case '+': p++; - } - - number = 0.; - exponent = 0; - num_digits = 0; - num_decimals = 0; - - // Process string of digits - while (isdigit(*p)) - { - number = number * 10. 
+ (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } - - // Process decimal part - if (*p == decimal) - { - p++; - - while (isdigit(*p)) - { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - exponent -= num_decimals; - } - - if (num_digits == 0) - { - errno = ERANGE; - return 0.0; - } - - // Correct for sign - if (negative) number = -number; - - // Process an exponent string - if (toupper(*p) == toupper(sci)) - { - // Handle optional sign +double xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing) { + double number; + int exponent; + int negative; + char *p = (char *)str; + double p10; + int n; + int num_digits; + int num_decimals; + + errno = 0; + + // Skip leading whitespace. + while (isspace(*p)) p++; + + // Handle optional sign. negative = 0; - switch (*++p) - { - case '-': negative = 1; // Fall through to increment pos - case '+': p++; + switch (*p) { + case '-': + negative = 1; // Fall through to increment position. + case '+': + p++; } - // Process string of digits + number = 0.; + exponent = 0; num_digits = 0; - n = 0; - while (isdigit(*p)) - { - n = n * 10 + (*p - '0'); - num_digits++; - p++; + num_decimals = 0; + + // Process string of digits. + while (isdigit(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + + p += (tsep != '\0' && *p == tsep); } - if (negative) - exponent -= n; - else - exponent += n; + // Process decimal part. + if (*p == decimal) { + p++; + + while (isdigit(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + num_decimals++; + } - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) - p--; - } + exponent -= num_decimals; + } + if (num_digits == 0) { + errno = ERANGE; + return 0.0; + } - if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) - { + // Correct for sign. + if (negative) number = -number; - errno = ERANGE; - return HUGE_VAL; - } + // Process an exponent string. + if (toupper(*p) == toupper(sci)) { + // Handle optional sign. + negative = 0; + switch (*++p) { + case '-': + negative = 1; // Fall through to increment pos. + case '+': + p++; + } - // Scale the result - p10 = 10.; - n = exponent; - if (n < 0) n = -n; - while (n) - { - if (n & 1) - { - if (exponent < 0) - number /= p10; - else - number *= p10; + // Process string of digits. + num_digits = 0; + n = 0; + while (isdigit(*p)) { + n = n * 10 + (*p - '0'); + num_digits++; + p++; + } + + if (negative) + exponent -= n; + else + exponent += n; + + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) p--; } - n >>= 1; - p10 *= p10; - } + if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { + errno = ERANGE; + return HUGE_VAL; + } - if (number == HUGE_VAL) { - errno = ERANGE; - } + // Scale the result. + p10 = 10.; + n = exponent; + if (n < 0) n = -n; + while (n) { + if (n & 1) { + if (exponent < 0) + number /= p10; + else + number *= p10; + } + n >>= 1; + p10 *= p10; + } - if (skip_trailing) { - // Skip trailing whitespace - while (isspace(*p)) p++; - } + if (number == HUGE_VAL) { + errno = ERANGE; + } - if (endptr) *endptr = p; + if (skip_trailing) { + // Skip trailing whitespace. 
+ while (isspace(*p)) p++; + } + if (endptr) *endptr = p; - return number; + return number; } -double precise_xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing) -{ +double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing) { double number; int exponent; int negative; - char *p = (char *) str; + char *p = (char *)str; int num_digits; int num_decimals; int max_digits = 17; int n; - // Cache powers of 10 in memory - static double e[] = {1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, - 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, - 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, - 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, - 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, - 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, - 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, - 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80, - 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90, - 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100, - 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110, - 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, - 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, - 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, - 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150, - 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160, - 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170, - 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180, - 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190, - 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200, - 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, - 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, - 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, - 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, - 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250, - 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260, - 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270, - 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280, - 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, - 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, - 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; + // Cache powers of 10 in memory. 
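    // Note on the table below: it lets the accumulated mantissa be scaled by
    // an exact power of ten in one or two multiplies/divides (e.g.
    // number * e[exponent]), rather than the bit-by-bit squaring loop used in
    // xstrtod() above, presumably to cut down on intermediate rounding in
    // this "precise" variant.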
+ static double e[] = { + 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, + 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, + 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, + 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, + 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, + 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, + 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, + 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, + 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, + 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, + 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, + 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, + 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, + 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, + 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, + 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, + 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, + 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, + 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, + 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, + 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, + 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, + 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, + 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, + 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, + 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, + 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, + 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, + 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, + 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, + 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; errno = 0; - // Skip leading whitespace + // Skip leading whitespace. while (isspace(*p)) p++; - // Handle optional sign + // Handle optional sign. negative = 0; - switch (*p) - { - case '-': negative = 1; // Fall through to increment position - case '+': p++; + switch (*p) { + case '-': + negative = 1; // Fall through to increment position. + case '+': + p++; } number = 0.; @@ -1841,66 +1653,59 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, num_digits = 0; num_decimals = 0; - // Process string of digits - while (isdigit(*p)) - { - if (num_digits < max_digits) - { + // Process string of digits. + while (isdigit(*p)) { + if (num_digits < max_digits) { number = number * 10. + (*p - '0'); num_digits++; - } - else + } else { ++exponent; + } p++; p += (tsep != '\0' && *p == tsep); } // Process decimal part - if (*p == decimal) - { + if (*p == decimal) { p++; - while (num_digits < max_digits && isdigit(*p)) - { + while (num_digits < max_digits && isdigit(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; num_decimals++; } - if (num_digits >= max_digits) // consume extra decimal digits - while (isdigit(*p)) - ++p; + if (num_digits >= max_digits) // Consume extra decimal digits. + while (isdigit(*p)) ++p; exponent -= num_decimals; } - if (num_digits == 0) - { + if (num_digits == 0) { errno = ERANGE; return 0.0; } - // Correct for sign + // Correct for sign. 
if (negative) number = -number; - // Process an exponent string - if (toupper(*p) == toupper(sci)) - { + // Process an exponent string. + if (toupper(*p) == toupper(sci)) { // Handle optional sign negative = 0; - switch (*++p) - { - case '-': negative = 1; // Fall through to increment pos - case '+': p++; + switch (*++p) { + case '-': + negative = 1; // Fall through to increment pos. + case '+': + p++; } - // Process string of digits + // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) - { + while (isdigit(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1911,33 +1716,28 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, else exponent += n; - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) - p--; + // If no digits after the 'e'/'E', un-consume it. + if (num_digits == 0) p--; } - if (exponent > 308) - { + if (exponent > 308) { errno = ERANGE; return HUGE_VAL; - } - else if (exponent > 0) + } else if (exponent > 0) { number *= e[exponent]; - else if (exponent < -308) // subnormal - { - if (exponent < -616) // prevent invalid array access + } else if (exponent < -308) { // Subnormal + if (exponent < -616) // Prevent invalid array access. number = 0.; number /= e[-308 - exponent]; number /= e[308]; - } - else + } else { number /= e[-exponent]; + } - if (number == HUGE_VAL || number == -HUGE_VAL) - errno = ERANGE; + if (number == HUGE_VAL || number == -HUGE_VAL) errno = ERANGE; if (skip_trailing) { - // Skip trailing whitespace + // Skip trailing whitespace. while (isspace(*p)) p++; } @@ -1945,9 +1745,8 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, return number; } -double round_trip(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing) -{ +double round_trip(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing) { #if PY_VERSION_HEX >= 0x02070000 return PyOS_string_to_double(p, q, 0); #else @@ -1955,31 +1754,12 @@ double round_trip(const char *p, char **q, char decimal, char sci, #endif } -/* -float strtof(const char *str, char **endptr) -{ - return (float) strtod(str, endptr); -} - - -long double strtold(const char *str, char **endptr) -{ - return strtod(str, endptr); -} - -double atof(const char *str) -{ - return strtod(str, NULL); -} -*/ - // End of xstrtod code // --------------------------------------------------------------------------- int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, - int *error, char tsep) -{ - const char *p = (const char *) p_item; + int *error, char tsep) { + const char *p = (const char *)p_item; int isneg = 0; int64_t number = 0; int d; @@ -1993,8 +1773,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (*p == '-') { isneg = 1; ++p; - } - else if (*p == '+') { + } else if (*p == '+') { p++; } @@ -2023,11 +1802,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } @@ -2036,25 +1813,20 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, while (isdigit(d)) { if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } } } - } - else { + } else { // If number is less than pre_max, at least one more digit 
// can be processed without overflowing. int64_t pre_max = int_max / 10; int dig_pre_max = int_max % 10; - //printf("pre_max = %lld dig_pre_max = %d\n", pre_max, dig_pre_max); - // Process the digits. d = *p; if (tsep != '\0') { @@ -2067,12 +1839,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } @@ -2081,12 +1851,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, while (isdigit(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); d = *++p; - } - else { + } else { *error = ERROR_OVERFLOW; return 0; } @@ -2108,66 +1876,3 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, *error = 0; return number; } - -/* does not look like this routine is used anywhere -uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error) -{ - int d, dig_pre_max; - uint64_t pre_max; - const char *p = (const char *) p_item; - uint64_t number = 0; - - // Skip leading spaces. - while (isspace(*p)) { - ++p; - } - - // Handle sign. - if (*p == '-') { - *error = ERROR_MINUS_SIGN; - return 0; - } - if (*p == '+') { - p++; - } - - // Check that there is a first digit. - if (!isdigit(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - pre_max = uint_max / 10; - dig_pre_max = uint_max % 10; - - // Process the digits. - d = *p; - while (isdigit(d)) { - if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - } - else { - *error = ERROR_OVERFLOW; - return 0; - } - } - - // Skip trailing spaces. - while (isspace(*p)) { - ++p; - } - - // Did we use up all the characters? 
- if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } - - *error = 0; - return number; -} -*/ diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 487c1265d9358..e01812f1c5520 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -9,29 +9,29 @@ See LICENSE for the license */ -#ifndef _PARSER_COMMON_H_ -#define _PARSER_COMMON_H_ +#ifndef PANDAS_SRC_PARSER_TOKENIZER_H_ +#define PANDAS_SRC_PARSER_TOKENIZER_H_ -#include "Python.h" +#include #include -#include #include +#include #include -#include +#include "Python.h" #include -#define ERROR_OK 0 -#define ERROR_NO_DIGITS 1 -#define ERROR_OVERFLOW 2 -#define ERROR_INVALID_CHARS 3 -#define ERROR_MINUS_SIGN 4 +#define ERROR_OK 0 +#define ERROR_NO_DIGITS 1 +#define ERROR_OVERFLOW 2 +#define ERROR_INVALID_CHARS 3 +#define ERROR_MINUS_SIGN 4 #include "../headers/stdint.h" #include "khash.h" -#define CHUNKSIZE 1024*256 +#define CHUNKSIZE 1024 * 256 #define KB 1024 #define MB 1024 * KB #define STREAM_INIT_SIZE 32 @@ -40,15 +40,15 @@ See LICENSE for the license #define CALLING_READ_FAILED 2 #ifndef P_INLINE - #if defined(__GNUC__) - #define P_INLINE static __inline__ - #elif defined(_MSC_VER) - #define P_INLINE - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define P_INLINE static inline - #else - #define P_INLINE - #endif +#if defined(__GNUC__) +#define P_INLINE static __inline__ +#elif defined(_MSC_VER) +#define P_INLINE +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define P_INLINE static inline +#else +#define P_INLINE +#endif #endif #if defined(_MSC_VER) @@ -62,41 +62,34 @@ See LICENSE for the license */ #define FALSE 0 -#define TRUE 1 - -/* Maximum number of columns in a file. */ -#define MAX_NUM_COLUMNS 2000 +#define TRUE 1 -/* Maximum number of characters in single field. */ - -#define FIELD_BUFFER_SIZE 2000 +// Maximum number of columns in a file. +#define MAX_NUM_COLUMNS 2000 +// Maximum number of characters in single field. +#define FIELD_BUFFER_SIZE 2000 /* * Common set of error types for the read_rows() and tokenize() * functions. */ - -#define ERROR_OUT_OF_MEMORY 1 -#define ERROR_INVALID_COLUMN_INDEX 10 +#define ERROR_OUT_OF_MEMORY 1 +#define ERROR_INVALID_COLUMN_INDEX 10 #define ERROR_CHANGED_NUMBER_OF_FIELDS 12 -#define ERROR_TOO_MANY_CHARS 21 -#define ERROR_TOO_MANY_FIELDS 22 -#define ERROR_NO_DATA 23 - - -/* #define VERBOSE */ +#define ERROR_TOO_MANY_CHARS 21 +#define ERROR_TOO_MANY_FIELDS 22 +#define ERROR_NO_DATA 23 +// #define VERBOSE #if defined(VERBOSE) #define TRACE(X) printf X; #else #define TRACE(X) #endif - #define PARSER_OUT_OF_MEMORY -1 - /* * XXX Might want to couple count_rows() with read_rows() to avoid duplication * of some file I/O. 
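The overflow guard used by str_to_int64 in the tokenizer.c hunks above reduces to one precomputed comparison per digit. A minimal sketch of the non-negative branch follows (editorial illustration only, not part of the patch; the helper name append_digit is invented here, and the negative branch in the real code mirrors this with pre_min and subtraction):

/* A digit d can be appended to *number without exceeding int_max iff
 * number < int_max / 10, or number == int_max / 10 and
 * (d - '0') <= int_max % 10. */
#include <stdint.h>

static int append_digit(int64_t *number, char d, int64_t int_max) {
    int64_t pre_max = int_max / 10;          /* largest value still safe to scale by 10 */
    int dig_pre_max = (int)(int_max % 10);   /* largest digit allowed when at pre_max */

    if ((*number < pre_max) ||
        ((*number == pre_max) && ((d - '0') <= dig_pre_max))) {
        *number = *number * 10 + (d - '0');
        return 1;                            /* digit consumed */
    }
    return 0;                                /* appending would overflow */
}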
@@ -108,7 +101,6 @@ See LICENSE for the license */ #define WORD_BUFFER_SIZE 4000 - typedef enum { START_RECORD, START_FIELD, @@ -131,12 +123,14 @@ typedef enum { } ParserState; typedef enum { - QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE + QUOTE_MINIMAL, + QUOTE_ALL, + QUOTE_NONNUMERIC, + QUOTE_NONE } QuoteStyle; - -typedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, - int *status); +typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, + int *status); typedef int (*io_cleanup)(void *src); typedef struct parser_t { @@ -156,38 +150,38 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int *word_starts; // where we are in the stream + int *word_starts; // where we are in the stream int words_len; int words_cap; - char *pword_start; // pointer to stream start of current field - int word_start; // position start of current field + char *pword_start; // pointer to stream start of current field + int word_start; // position start of current field - int *line_start; // position in words for start of line - int *line_fields; // Number of fields in each line - int lines; // Number of (good) lines observed - int file_lines; // Number of file lines observed (including bad or skipped) - int lines_cap; // Vector capacity + int *line_start; // position in words for start of line + int *line_fields; // Number of fields in each line + int lines; // Number of (good) lines observed + int file_lines; // Number of file lines observed (including bad or skipped) + int lines_cap; // Vector capacity // Tokenizing stuff ParserState state; - int doublequote; /* is " represented by ""? */ - char delimiter; /* field separator */ - int delim_whitespace; /* delimit by consuming space/tabs instead */ - char quotechar; /* quote character */ - char escapechar; /* escape character */ + int doublequote; /* is " represented by ""? */ + char delimiter; /* field separator */ + int delim_whitespace; /* delimit by consuming space/tabs instead */ + char quotechar; /* quote character */ + char escapechar; /* escape character */ char lineterminator; - int skipinitialspace; /* ignore spaces following delimiter? */ - int quoting; /* style of quoting to write */ + int skipinitialspace; /* ignore spaces following delimiter? 
*/ + int quoting; /* style of quoting to write */ // krufty, hmm =/ int numeric_field; char commentchar; int allow_embedded_newline; - int strict; /* raise exception on bad CSV */ + int strict; /* raise exception on bad CSV */ - int usecols; // Boolean: 1: usecols provided, 0: none provided + int usecols; // Boolean: 1: usecols provided, 0: none provided int expected_fields; int error_bad_lines; @@ -200,9 +194,9 @@ typedef struct parser_t { // thousands separator (comma, period) char thousands; - int header; // Boolean: 1: has header, 0: no header - int header_start; // header row start - int header_end; // header row end + int header; // Boolean: 1: has header, 0: no header + int header_start; // header row start + int header_end; // header row end void *skipset; int64_t skip_first_N_rows; @@ -216,7 +210,6 @@ typedef struct parser_t { int skip_empty_lines; } parser_t; - typedef struct coliter_t { char **words; int *line_start; @@ -226,15 +219,13 @@ typedef struct coliter_t { void coliter_setup(coliter_t *self, parser_t *parser, int i, int start); coliter_t *coliter_new(parser_t *self, int i); -/* #define COLITER_NEXT(iter) iter->words[iter->line_start[iter->line++] + iter->col] */ -// #define COLITER_NEXT(iter) iter.words[iter.line_start[iter.line++] + iter.col] +#define COLITER_NEXT(iter, word) \ + do { \ + const int i = *iter.line_start++ + iter.col; \ + word = i < *iter.line_start ? iter.words[i] : ""; \ + } while (0) -#define COLITER_NEXT(iter, word) do { \ - const int i = *iter.line_start++ + iter.col; \ - word = i < *iter.line_start ? iter.words[i]: ""; \ - } while(0) - -parser_t* parser_new(void); +parser_t *parser_new(void); int parser_init(parser_t *self); @@ -256,24 +247,17 @@ int tokenize_nrows(parser_t *self, size_t nrows); int tokenize_all_rows(parser_t *self); -/* - - Have parsed / type-converted a chunk of data and want to free memory from the - token stream - - */ -//int clear_parsed_lines(parser_t *self, size_t nlines); - -int64_t str_to_int64(const char *p_item, int64_t int_min, - int64_t int_max, int *error, char tsep); -//uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); - -double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); -double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); -double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); -//int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); -//int P_INLINE to_longlong(char *item, long long *p_value); -//int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); +// Have parsed / type-converted a chunk of data +// and want to free memory from the token stream + +int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, + int *error, char tsep); +double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing); +double precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing); +double round_trip(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing); int to_boolean(const char *item, uint8_t *val); -#endif // _PARSER_COMMON_H_ +#endif // PANDAS_SRC_PARSER_TOKENIZER_H_ From 1725d24639a7c350a48fd201ca71b4548ea7186b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 6 Dec 2016 14:10:13 -0500 Subject: [PATCH 136/183] TST: Test pivot with categorical data closes #8860 closes #8731 closes #9534 
Looks like the examples in these issues pass in the master branch. Added test to confirm. Author: Matt Roeschke Closes #14807 from mroeschke/test_pivot_categoricals and squashes the following commits: b506083 [Matt Roeschke] TST: Test pivot with categorical data --- pandas/tools/tests/test_pivot.py | 69 ++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 5944fa1b34611..26f80f463d609 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -854,6 +854,53 @@ def test_categorical_margins(self): table = data.pivot_table('x', 'y', 'z', margins=True) tm.assert_frame_equal(table, expected) + def test_categorical_aggfunc(self): + # GH 9534 + df = pd.DataFrame({"C1": ["A", "B", "C", "C"], + "C2": ["a", "a", "b", "b"], + "V": [1, 2, 3, 4]}) + df["C1"] = df["C1"].astype("category") + result = df.pivot_table("V", index="C1", columns="C2", aggfunc="count") + + expected_index = pd.CategoricalIndex(['A', 'B', 'C'], + categories=['A', 'B', 'C'], + ordered=False, + name='C1') + expected_columns = pd.Index(['a', 'b'], name='C2') + expected_data = np.array([[1., np.nan], + [1., np.nan], + [np.nan, 2.]]) + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_columns) + tm.assert_frame_equal(result, expected) + + def test_categorical_pivot_index_ordering(self): + # GH 8731 + df = pd.DataFrame({'Sales': [100, 120, 220], + 'Month': ['January', 'January', 'January'], + 'Year': [2013, 2014, 2013]}) + months = ['January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', + 'December'] + df['Month'] = df['Month'].astype('category').cat.set_categories(months) + result = df.pivot_table(values='Sales', + index='Month', + columns='Year', + aggfunc='sum') + expected_columns = pd.Int64Index([2013, 2014], name='Year') + expected_index = pd.CategoricalIndex(months, + categories=months, + ordered=False, + name='Month') + expected_data = np.empty((12, 2)) + expected_data.fill(np.nan) + expected_data[0, :] = [320., 120.] 
+ expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_columns) + tm.assert_frame_equal(result, expected) + class TestCrosstab(tm.TestCase): @@ -1212,6 +1259,28 @@ def test_crosstab_errors(self): with tm.assertRaisesRegexp(ValueError, error): pd.crosstab(df.a, df.b, normalize='all', margins=42) + def test_crosstab_with_categorial_columns(self): + # GH 8860 + df = pd.DataFrame({'MAKE': ['Honda', 'Acura', 'Tesla', + 'Honda', 'Honda', 'Acura'], + 'MODEL': ['Sedan', 'Sedan', 'Electric', + 'Pickup', 'Sedan', 'Sedan']}) + categories = ['Sedan', 'Electric', 'Pickup'] + df['MODEL'] = (df['MODEL'].astype('category') + .cat.set_categories(categories)) + result = pd.crosstab(df['MAKE'], df['MODEL']) + + expected_index = pd.Index(['Acura', 'Honda', 'Tesla'], name='MAKE') + expected_columns = pd.CategoricalIndex(categories, + categories=categories, + ordered=False, + name='MODEL') + expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_columns) + tm.assert_frame_equal(result, expected) + if __name__ == '__main__': import nose From 2466ecbb717d8cdfd30cc20d5d22e5e095d9a14d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Dec 2016 18:18:25 -0500 Subject: [PATCH 137/183] BLD: try new build credentials for pandas-docs --- .travis.yml | 2 +- ci/build_docs.sh | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4a0c6d77fcf45..7de67476f5ec4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ env: global: # pandas-docs-travis GH - - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" + - secure: "UJK7kUtkcnV9PFP4IBXAvgmRQKdwARlfqF4UZQ5tBwrpnD1a3n7FLBijcuXQ3jkvwpEc/FZB9RJDXmsqYXJPvq3BC++2Cv2tFDvKr/c+y8KffszAyVk47jKEHMNmGgauwaNMggsE/rH8YHe4so9LsJHTRbzmLo8lXPNTldoIu5s=" git: # for cloning diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 8594cd4af34e5..fac9edbab89d3 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -43,7 +43,9 @@ if [ x"$DOC_BUILD" != x"" ]; then cd /tmp/doc/build/html git config --global user.email "pandas-docs-bot@localhost.foo" git config --global user.name "pandas-docs-bot" + git config --global credential.helper cache + # create the repo git init touch README git add README @@ -53,7 +55,7 @@ if [ x"$DOC_BUILD" != x"" ]; then touch .nojekyll git add --all . git commit -m "Version" --allow-empty - git remote add origin https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis + git remote add origin "https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis" git push origin gh-pages -f fi From 3ac41ab2d7ae446fa04f47ec911003cd722dbd65 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 8 Dec 2016 08:41:52 -0800 Subject: [PATCH 138/183] DOC: Fix grammar and formatting typos (#14803) --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 24 +++++++++++++++--------- pandas/core/ops.py | 2 +- pandas/core/series.py | 4 ++-- pandas/tools/plotting.py | 5 +++-- pandas/tseries/index.py | 2 +- 6 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0053135e1fd85..0d4bcd781cf74 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2483,7 +2483,7 @@ def assign(self, **kwargs): Notes ----- Since ``kwargs`` is a dictionary, the order of your - arguments may not be preserved. 
The make things predicatable, + arguments may not be preserved. To make things predicatable, the columns are inserted in alphabetical order, at the end of your DataFrame. Assigning multiple columns within the same ``assign`` is possible, but you cannot reference other columns diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b7e43d6fe01e8..64e3d60e1fe14 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3635,14 +3635,17 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, require that you also specify an `order` (int), e.g. df.interpolate(method='polynomial', order=4). These use the actual numerical values of the index. - * 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' are all - wrappers around the scipy interpolation methods of similar - names. These use the actual numerical values of the index. See - the scipy documentation for more on their behavior - `here `__ # noqa - `and here `__ # noqa + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' + are all wrappers around the scipy interpolation methods of + similar names. These use the actual numerical values of the + index. For more information on their behavior, see the + `scipy documentation + `__ + and `tutorial documentation + `__ * 'from_derivatives' refers to BPoly.from_derivatives which - replaces 'piecewise_polynomial' interpolation method in scipy 0.18 + replaces 'piecewise_polynomial' interpolation method in + scipy 0.18 .. versionadded:: 0.18.1 @@ -3656,7 +3659,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * 1: fill row-by-row limit : int, default None. Maximum number of consecutive NaNs to fill. - limit_direction : {'forward', 'backward', 'both'}, defaults to 'forward' + limit_direction : {'forward', 'backward', 'both'}, default 'forward' If limit is specified, consecutive NaNs will be filled in this direction. @@ -4159,6 +4162,9 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, .. versionadded:: 0.19.0 + Notes + ----- + To learn more about the offset strings, please see `this link `__. @@ -4346,7 +4352,7 @@ def rank(self, axis=0, method='average', numeric_only=None, Parameters ---------- - axis: {0 or 'index', 1 or 'columns'}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 index to direct ranking method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 7c5ad04cc90b0..80de3cd85d4db 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1006,7 +1006,7 @@ def wrapper(self, other): Parameters ---------- -other: Series or scalar value +other : Series or scalar value fill_value : None or float value, default None (NaN) Fill missing (NaN) values with this value. If both Series are missing, the result will be missing diff --git a/pandas/core/series.py b/pandas/core/series.py index 958cf183578dd..7018865e5b3ec 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2033,9 +2033,9 @@ def reorder_levels(self, order): Parameters ---------- - order: list of int representing new level order. + order : list of int representing new level order. 
(reference level by number or key) - axis: where to reorder levels + axis : where to reorder levels Returns ------- diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index d46dc4d355b4c..e4cf896a89f57 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2893,8 +2893,9 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, invisible figsize : tuple The size of the figure to create in inches by default - layout: (optional) a tuple (rows, columns) for the layout of the histograms - bins: integer, default 10 + layout : tuple, optional + Tuple of (rows, columns) for the layout of the histograms + bins : integer, default 10 Number of histogram bins to be used kwds : other plotting keyword arguments To be passed to hist function diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 0824072cc383f..3edf75fbb82ae 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -2000,7 +2000,7 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, Frequency strings can have multiples, e.g. '5H' tz : string or None Time zone name for returning localized DatetimeIndex, for example - Asia/Hong_Kong + Asia/Hong_Kong normalize : bool, default False Normalize start/end dates to midnight before generating date range name : str, default None From 5f057cbe2d35965c90a80b10b285bd1c88d7a523 Mon Sep 17 00:00:00 2001 From: bmagnusson Date: Fri, 9 Dec 2016 00:32:33 -0800 Subject: [PATCH 139/183] ENH: Add the ability to have a separate title for each subplot when plotting (#14753) * Add logic such that if 'title' is a list and 'subplots' is True, use each item of the list as the title of the individual subplots. --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/plotting/test_misc.py | 29 ++++++++++++++++++++++++++--- pandas/tools/plotting.py | 25 ++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0bfd755aae40c..aeafc76876bbd 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -53,6 +53,7 @@ Other enhancements - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) +- ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) .. 
_whatsnew_0200.api_breaking: diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index a484217da5969..6c313f5937602 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -16,13 +16,11 @@ from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, _ok_for_gaussian_kde) - """ Test cases for misc plot functions """ @tm.mplskip class TestSeriesPlots(TestPlotBase): - def setUp(self): TestPlotBase.setUp(self) import matplotlib as mpl @@ -54,7 +52,6 @@ def test_bootstrap_plot(self): @tm.mplskip class TestDataFramePlots(TestPlotBase): - @slow def test_scatter_plot_legacy(self): tm._skip_if_no_scipy() @@ -277,6 +274,32 @@ def test_radviz(self): handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, facecolors=colors) + @slow + def test_subplot_titles(self): + df = self.iris.drop('Name', axis=1).head() + # Use the column names as the subplot titles + title = list(df.columns) + + # Case len(title) == len(df) + plot = df.plot(subplots=True, title=title) + self.assertEqual([p.get_title() for p in plot], title) + + # Case len(title) > len(df) + self.assertRaises(ValueError, df.plot, subplots=True, + title=title + ["kittens > puppies"]) + + # Case len(title) < len(df) + self.assertRaises(ValueError, df.plot, subplots=True, title=title[:2]) + + # Case subplots=False and title is of type list + self.assertRaises(ValueError, df.plot, subplots=False, title=title) + + # Case df with 3 numeric columns but layout of (2,2) + plot = df.drop('SepalWidth', axis=1).plot(subplots=True, layout=(2, 2), + title=title[:-1]) + title_list = [ax.get_title() for sublist in plot for ax in sublist] + self.assertEqual(title_list, title[:3] + ['']) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index e4cf896a89f57..21e8b64a3656a 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1217,8 +1217,25 @@ def _adorn_subplots(self): if self.title: if self.subplots: - self.fig.suptitle(self.title) + if is_list_like(self.title): + if len(self.title) != self.nseries: + msg = ('The length of `title` must equal the number ' + 'of columns if using `title` of type `list` ' + 'and `subplots=True`.\n' + 'length of title = {}\n' + 'number of columns = {}').format( + len(self.title), self.nseries) + raise ValueError(msg) + + for (ax, title) in zip(self.axes, self.title): + ax.set_title(title) + else: + self.fig.suptitle(self.title) else: + if is_list_like(self.title): + msg = ('Using `title` of type `list` is not supported ' + 'unless `subplots=True` is passed') + raise ValueError(msg) self.axes[0].set_title(self.title) def _apply_axis_properties(self, axis, rot=None, fontsize=None): @@ -2555,8 +2572,10 @@ def _plot(data, x=None, y=None, subplots=False, figsize : a tuple (width, height) in inches use_index : boolean, default True Use index as ticks for x axis - title : string - Title to use for the plot + title : string or list + Title to use for the plot. If a string is passed, print the string at + the top of the figure. If a list is passed and `subplots` is True, + print each item in the list above the corresponding subplot. grid : boolean, default None (matlab style default) Axis grid lines legend : False/True/'reverse' From 36bb8afb6f98dc19558c5ea32362dd033384ff25 Mon Sep 17 00:00:00 2001 From: "Dr. 
Irv" Date: Fri, 9 Dec 2016 11:36:29 -0500 Subject: [PATCH 140/183] ENH: Introduce UnsortedIndexError GH11897 (#14762) --- doc/source/advanced.rst | 4 +++- doc/source/whatsnew/v0.20.0.txt | 8 ++++++++ pandas/core/common.py | 10 ++++++++++ pandas/core/indexing.py | 4 +++- pandas/indexes/multi.py | 10 ++++++---- pandas/tests/indexes/test_multi.py | 18 +++++++++++++++++- pandas/tests/indexing/test_indexing.py | 14 +++++++++----- 7 files changed, 56 insertions(+), 12 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 0c843dd39b56f..7b6b2a09f6037 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -528,12 +528,14 @@ return a copy of the data rather than a view: jim joe 1 z 0.64094 +.. _advanced.unsorted: + Furthermore if you try to index something that is not fully lexsorted, this can raise: .. code-block:: ipython In [5]: dfm.loc[(0,'y'):(1, 'z')] - KeyError: 'Key length (2) was greater than MultiIndex lexsort depth (1)' + UnsortedIndexError: 'Key length (2) was greater than MultiIndex lexsort depth (1)' The ``is_lexsorted()`` method on an ``Index`` show if the index is sorted, and the ``lexsort_depth`` property returns the sort depth: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index aeafc76876bbd..d89103309e990 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -50,6 +50,11 @@ Other enhancements - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) + +- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an + unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack + of sorting or an incorrect key. See :ref:`here ` + - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) @@ -71,6 +76,9 @@ Backwards incompatible API changes Other API Changes ^^^^^^^^^^^^^^^^^ +- Change error message text when indexing via a + boolean ``Series`` that has an incompatible index (:issue:`14491`) + .. _whatsnew_0200.deprecations: Deprecations diff --git a/pandas/core/common.py b/pandas/core/common.py index 295947bbc1166..fddac1f29d454 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -97,6 +97,16 @@ class UnsupportedFunctionCall(ValueError): pass +class UnsortedIndexError(KeyError): + """ Error raised when attempting to get a slice of a MultiIndex + and the index has not been lexsorted. Subclass of `KeyError`. + + .. versionadded:: 0.20.0 + + """ + pass + + class AbstractMethodError(NotImplementedError): """Raise this error instead of NotImplementedError for abstract methods while keeping compatibility with Python 2 and Python 3. 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 660e8c9446202..c4ae3dcca8367 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1814,7 +1814,9 @@ def check_bool_indexer(ax, key): result = result.reindex(ax) mask = isnull(result._values) if mask.any(): - raise IndexingError('Unalignable boolean Series key provided') + raise IndexingError('Unalignable boolean Series provided as ' + 'indexer (index of the boolean Series and of ' + 'the indexed object do not match') result = result.astype(bool)._values elif is_sparse(result): result = result.to_dense() diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 45b6cad89d020..132543e0e386c 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -25,7 +25,8 @@ from pandas.core.common import (_values_from_object, is_bool_indexer, is_null_slice, - PerformanceWarning) + PerformanceWarning, + UnsortedIndexError) from pandas.core.base import FrozenList @@ -1936,9 +1937,10 @@ def get_locs(self, tup): # must be lexsorted to at least as many levels if not self.is_lexsorted_for_tuple(tup): - raise KeyError('MultiIndex Slicing requires the index to be fully ' - 'lexsorted tuple len ({0}), lexsort depth ' - '({1})'.format(len(tup), self.lexsort_depth)) + raise UnsortedIndexError('MultiIndex Slicing requires the index ' + 'to be fully lexsorted tuple len ({0}), ' + 'lexsort depth ({1})' + .format(len(tup), self.lexsort_depth)) # indexer # this is the list of all values that we want to select diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e1e714719092a..ccbe65e58a1a5 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -8,7 +8,7 @@ from pandas import (DataFrame, date_range, period_range, MultiIndex, Index, CategoricalIndex, compat) -from pandas.core.common import PerformanceWarning +from pandas.core.common import PerformanceWarning, UnsortedIndexError from pandas.indexes.base import InvalidIndexError from pandas.compat import range, lrange, u, PY3, long, lzip @@ -2535,3 +2535,19 @@ def test_dropna(self): msg = "invalid how option: xxx" with tm.assertRaisesRegexp(ValueError, msg): idx.dropna(how='xxx') + + def test_unsortedindex(self): + # GH 11897 + mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), + ('x', 'b'), ('y', 'a'), ('z', 'b')], + names=['one', 'two']) + df = pd.DataFrame([[i, 10 * i] for i in lrange(6)], index=mi, + columns=['one', 'two']) + + with assertRaises(UnsortedIndexError): + df.loc(axis=0)['z', :] + df.sort_index(inplace=True) + self.assertEqual(len(df.loc(axis=0)['z', :]), 2) + + with assertRaises(KeyError): + df.loc(axis=0)['q', :] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 9ca1fd2a76817..bc95ff329d686 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -23,7 +23,7 @@ MultiIndex, Timestamp, Timedelta) from pandas.formats.printing import pprint_thing from pandas import concat -from pandas.core.common import PerformanceWarning +from pandas.core.common import PerformanceWarning, UnsortedIndexError import pandas.util.testing as tm from pandas import date_range @@ -2230,7 +2230,7 @@ def f(): df = df.sortlevel(level=1, axis=0) self.assertEqual(df.index.lexsort_depth, 0) with tm.assertRaisesRegexp( - KeyError, + UnsortedIndexError, 'MultiIndex Slicing requires the index to be fully ' r'lexsorted tuple len \(2\), lexsort depth \(0\)'): df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 
5), :] @@ -2417,7 +2417,7 @@ def test_per_axis_per_level_doc_examples(self): def f(): df.loc['A1', (slice(None), 'foo')] - self.assertRaises(KeyError, f) + self.assertRaises(UnsortedIndexError, f) df = df.sortlevel(axis=1) # slicing @@ -3480,8 +3480,12 @@ def test_iloc_mask(self): ('index', '.loc'): '0b11', ('index', '.iloc'): ('iLocation based boolean indexing ' 'cannot use an indexable as a mask'), - ('locs', ''): 'Unalignable boolean Series key provided', - ('locs', '.loc'): 'Unalignable boolean Series key provided', + ('locs', ''): 'Unalignable boolean Series provided as indexer ' + '(index of the boolean Series and of the indexed ' + 'object do not match', + ('locs', '.loc'): 'Unalignable boolean Series provided as indexer ' + '(index of the boolean Series and of the ' + 'indexed object do not match', ('locs', '.iloc'): ('iLocation based boolean indexing on an ' 'integer type is not available'), } From 0699c89882133a41c250abdac02796fec84512e8 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 9 Dec 2016 14:34:31 -0500 Subject: [PATCH 141/183] BF(TST): use = (native) instead of < (little endian) for target data types (#14832) --- pandas/io/tests/parser/common.py | 14 +++++++------- pandas/tests/series/test_datetime_values.py | 2 +- pandas/tseries/tests/test_timeseries.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 6eb73876c11dd..b6d1d4bb09f56 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1453,7 +1453,7 @@ def test_as_recarray(self): FutureWarning, check_stacklevel=False): data = 'a,b\n1,a\n2,b' expected = np.array([(1, 'a'), (2, 'b')], - dtype=[('a', ' Date: Fri, 9 Dec 2016 17:17:18 -0500 Subject: [PATCH 142/183] BLD: use org name in build-docs.sh --- ci/build_docs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index fac9edbab89d3..d55dce1344a64 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -55,7 +55,7 @@ if [ x"$DOC_BUILD" != x"" ]; then touch .nojekyll git add --all . 
git commit -m "Version" --allow-empty - git remote add origin "https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis" + git remote add origin "https://pandas-docs:$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis" git push origin gh-pages -f fi From 3710f2e68152c81f3989e9ed257a8843687bb82c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 10 Dec 2016 12:02:53 +0100 Subject: [PATCH 143/183] TST: add test to confirm GH14606 (specify category dtype for empty) (#14752) Issue #14606 was fixed by PR #14717, adding one more specific test to confirm this --- pandas/io/tests/parser/dtypes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index 18c37b31f6480..b9ab79c3b9d54 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -241,6 +241,9 @@ def test_empty_dtype(self): result = self.read_csv(StringIO(data), header=0, dtype='category') tm.assert_frame_equal(result, expected) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': 'category', 'b': 'category'}) + tm.assert_frame_equal(result, expected) expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') result = self.read_csv(StringIO(data), header=0, From 6e09022c9ed009e94a88d2137f30bea008f416c0 Mon Sep 17 00:00:00 2001 From: wandersoncferreira Date: Sat, 10 Dec 2016 09:12:00 -0200 Subject: [PATCH 144/183] DOC: add section on groupby().rolling/expanding/resample (#14801) --- doc/source/computation.rst | 5 ++++ doc/source/groupby.rst | 48 ++++++++++++++++++++++++++++++++++++++ doc/source/timeseries.rst | 3 +++ 3 files changed, 56 insertions(+) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 1414d2dd3c8dc..d727424750be5 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -214,6 +214,11 @@ computing common *window* or *rolling* statistics. Among these are count, sum, mean, median, correlation, variance, covariance, standard deviation, skewness, and kurtosis. +Starting in version 0.18.1, the ``rolling()`` and ``expanding()`` +functions can be used directly from DataFrameGroupBy objects, +see the :ref:`groupby docs `. + + .. note:: The API for window statistics is quite similar to the way one works with ``GroupBy`` objects, see the documentation :ref:`here ` diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index c5a77770085d6..ff97775afc2e2 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -614,6 +614,54 @@ and that the transformed data contains no NAs. grouped.ffill() + +.. _groupby.transform.window_resample: + +New syntax to window and resample operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. versionadded:: 0.18.1 + +Working with the resample, expanding or rolling operations on the groupby +level used to require the application of helper functions. However, +now it is possible to use ``resample()``, ``expanding()`` and +``rolling()`` as methods on groupbys. + +The example below will apply the ``rolling()`` method on the samples of +the column B based on the groups of column A. + +.. ipython:: python + + df = pd.DataFrame({'A': [1] * 10 + [5] * 10, + 'B': np.arange(20)}) + df + + df.groupby('A').rolling(4).B.mean() + + +The ``expanding()`` method will accumulate a given operation +(``sum()`` in the example) for all the members of each particular +group. + +.. 
ipython:: python + + df.groupby('A').expanding().sum() + + +Suppose you want to use the ``resample()`` method to get a daily +frequency in each group of your dataframe and wish to complete the +missing values with the ``ffill()`` method. + +.. ipython:: python + + df = pd.DataFrame({'date': pd.date_range(start='2016-01-01', + periods=4, + freq='W'), + 'group': [1, 1, 2, 2], + 'val': [5, 6, 7, 8]}).set_index('date') + df + + df.groupby('group').resample('1D').ffill() + .. _groupby.filter: Filtration diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 854de443ac5ee..9253124f7e8b2 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1288,6 +1288,9 @@ limited to, financial applications. ``.resample()`` is a time-based groupby, followed by a reduction method on each of its groups. See some :ref:`cookbook examples ` for some advanced strategies +Starting in version 0.18.1, the ``resample()`` function can be used directly from +DataFrameGroupBy objects, see the :ref:`groupby docs `. + .. note:: ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion :ref:`here ` From 1dbc7bedcde3274d945790d17c864eecf787a9fa Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Sat, 10 Dec 2016 06:19:49 -0500 Subject: [PATCH 145/183] ENH: add timedelta as valid type for interpolate with method='time' (#14799) --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/missing.py | 5 +++-- pandas/tests/series/test_missing.py | 17 +++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d89103309e990..f534c67273560 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -59,6 +59,7 @@ Other enhancements - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) +- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) .. 
_whatsnew_0200.api_breaking: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index b847415f274db..f1191ff1c7009 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -12,7 +12,8 @@ is_float_dtype, is_datetime64_dtype, is_integer_dtype, _ensure_float64, is_scalar, - _DATELIKE_DTYPES) + _DATELIKE_DTYPES, + needs_i8_conversion) from pandas.types.missing import isnull @@ -187,7 +188,7 @@ def _interp_limit(invalid, fw_limit, bw_limit): if method in ('values', 'index'): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 - if issubclass(inds.dtype.type, np.datetime64): + if needs_i8_conversion(inds.dtype.type): inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 4e6c58df54dfd..5666a07cad4b8 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -891,6 +891,23 @@ def test_spline_error(self): with tm.assertRaises(ValueError): s.interpolate(method='spline', order=0) + def test_interp_timedelta64(self): + # GH 6424 + df = Series([1, np.nan, 3], + index=pd.to_timedelta([1, 2, 3])) + result = df.interpolate(method='time') + expected = Series([1., 2., 3.], + index=pd.to_timedelta([1, 2, 3])) + assert_series_equal(result, expected) + + # test for non uniform spacing + df = Series([1, np.nan, 3], + index=pd.to_timedelta([1, 2, 4])) + result = df.interpolate(method='time') + expected = Series([1., 1.666667, 3.], + index=pd.to_timedelta([1, 2, 4])) + assert_series_equal(result, expected) + if __name__ == '__main__': import nose From ad3eca1ef52cb426ca7e0ee600249d0724a4b9e8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 10 Dec 2016 15:30:31 +0100 Subject: [PATCH 146/183] CLN/PERF: clean-up of the benchmarks (#14099) --- asv_bench/benchmarks/algorithms.py | 21 +- asv_bench/benchmarks/attrs_caching.py | 23 +- asv_bench/benchmarks/binary_ops.py | 239 +-- asv_bench/benchmarks/categoricals.py | 92 +- asv_bench/benchmarks/ctors.py | 46 +- asv_bench/benchmarks/eval.py | 41 +- asv_bench/benchmarks/frame_ctor.py | 1728 +--------------------- asv_bench/benchmarks/frame_methods.py | 1100 +++++--------- asv_bench/benchmarks/gil.py | 173 +-- asv_bench/benchmarks/groupby.py | 477 +++--- asv_bench/benchmarks/hdfstore_bench.py | 331 +---- asv_bench/benchmarks/index_object.py | 334 ++--- asv_bench/benchmarks/indexing.py | 491 ++---- asv_bench/benchmarks/inference.py | 170 +-- asv_bench/benchmarks/io_sql.py | 202 +-- asv_bench/benchmarks/join_merge.py | 448 +++--- asv_bench/benchmarks/miscellaneous.py | 52 - asv_bench/benchmarks/packers.py | 778 ++-------- asv_bench/benchmarks/pandas_vb_common.py | 4 +- asv_bench/benchmarks/panel_ctor.py | 8 +- asv_bench/benchmarks/panel_methods.py | 46 +- asv_bench/benchmarks/parser_vb.py | 168 +-- asv_bench/benchmarks/period.py | 40 +- asv_bench/benchmarks/plotting.py | 6 +- asv_bench/benchmarks/reindex.py | 423 ++---- asv_bench/benchmarks/strings.py | 368 +---- asv_bench/benchmarks/timedelta.py | 52 +- asv_bench/benchmarks/timeseries.py | 1219 +++------------ 28 files changed, 1940 insertions(+), 7140 deletions(-) delete mode 100644 asv_bench/benchmarks/miscellaneous.py diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 53b7d55368f6a..c4a6117c0704a 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -3,7 +3,7 @@ from pandas.util import testing as tm -class algorithm(object): +class 
Algorithms(object): goal_time = 0.2 def setup(self): @@ -24,21 +24,28 @@ def setup(self): self.arrneg = np.arange(-1000000, 0) self.arrmixed = np.array([1, -1]).repeat(500000) - def time_int_factorize(self): + # match + self.uniques = tm.makeStringIndex(1000).values + self.all = self.uniques.repeat(10) + + def time_factorize_int(self): self.int.factorize() - def time_float_factorize(self): + def time_factorize_float(self): self.int.factorize() - def time_int_unique_duplicated(self): + def time_duplicated_int_unique(self): self.int_unique.duplicated() - def time_int_duplicated(self): + def time_duplicated_int(self): self.int.duplicated() - def time_float_duplicated(self): + def time_duplicated_float(self): self.float.duplicated() + def time_match_strings(self): + pd.match(self.all, self.uniques) + def time_add_overflow_pos_scalar(self): self.checked_add(self.arr, 1) @@ -58,7 +65,7 @@ def time_add_overflow_mixed_arr(self): self.checked_add(self.arr, self.arrmixed) -class hashing(object): +class Hashing(object): goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index de9aa18937985..9210f1f2878d4 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,23 +1,32 @@ from .pandas_vb_common import * +from pandas.util.decorators import cache_readonly -class getattr_dataframe_index(object): +class DataFrameAttributes(object): goal_time = 0.2 def setup(self): self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index - def time_getattr_dataframe_index(self): + def time_get_index(self): self.foo = self.df.index + def time_set_index(self): + self.df.index = self.cur_index + -class setattr_dataframe_index(object): +class CacheReadonly(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(10, 6)) - self.cur_index = self.df.index - def time_setattr_dataframe_index(self): - self.df.index = self.cur_index + class Foo: + + @cache_readonly + def prop(self): + return 5 + self.obj = Foo() + + def time_cache_readonly(self): + self.obj.prop diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index d22d01f261b27..53cb1cf465698 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -2,193 +2,79 @@ import pandas.computation.expressions as expr -class frame_add(object): +class Ops(object): goal_time = 0.2 - def setup(self): + params = [[True, False], ['default', 1]] + param_names = ['use_numexpr', 'threads'] + + def setup(self, use_numexpr, threads): self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) - def time_frame_add(self): - (self.df + self.df2) + if threads != 'default': + expr.set_numexpr_threads(threads) + if not use_numexpr: + expr.set_use_numexpr(False) -class frame_add_no_ne(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_add_no_ne(self): + def time_frame_add(self, use_numexpr, threads): (self.df + self.df2) - def teardown(self): - expr.set_use_numexpr(True) + def time_frame_mult(self, use_numexpr, threads): + (self.df * self.df2) + def time_frame_multi_and(self, use_numexpr, threads): + self.df[((self.df > 0) & (self.df2 > 0))] -class frame_add_st(object): - goal_time = 0.2 + def time_frame_comparison(self, use_numexpr, threads): + (self.df > self.df2) - def setup(self): - self.df = 
DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_frame_add_st(self): - (self.df + self.df2) - - def teardown(self): + def teardown(self, use_numexpr, threads): + expr.set_use_numexpr(True) expr.set_numexpr_threads() -class frame_float_div(object): +class Ops2(object): goal_time = 0.2 def setup(self): self.df = DataFrame(np.random.randn(1000, 1000)) self.df2 = DataFrame(np.random.randn(1000, 1000)) - def time_frame_float_div(self): - (self.df // self.df2) + self.df_int = DataFrame( + np.random.random_integers(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(1000, 1000))) + self.df2_int = DataFrame( + np.random.random_integers(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(1000, 1000))) + ## Division -class frame_float_div_by_zero(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) + def time_frame_float_div(self): + (self.df // self.df2) def time_frame_float_div_by_zero(self): (self.df / 0) - -class frame_float_floor_by_zero(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - def time_frame_float_floor_by_zero(self): (self.df // 0) - -class frame_float_mod(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - self.df2 = DataFrame(np.random.randn(1000, 1000)) - - def time_frame_float_mod(self): - (self.df / self.df2) - - -class frame_int_div_by_zero(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) - def time_frame_int_div_by_zero(self): - (self.df / 0) - + (self.df_int / 0) -class frame_int_mod(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) - self.df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) + ## Modulo def time_frame_int_mod(self): (self.df / self.df2) - -class frame_mult(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - - def time_frame_mult(self): - (self.df * self.df2) - - -class frame_mult_no_ne(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_mult_no_ne(self): - (self.df * self.df2) - - def teardown(self): - expr.set_use_numexpr(True) - - -class frame_mult_st(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_frame_mult_st(self): - (self.df * self.df2) - - def teardown(self): - expr.set_numexpr_threads() - - -class frame_multi_and(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - - def time_frame_multi_and(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - -class frame_multi_and_no_ne(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_multi_and_no_ne(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - def 
teardown(self): - expr.set_use_numexpr(True) - - -class frame_multi_and_st(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_frame_multi_and_st(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - def teardown(self): - expr.set_numexpr_threads() + def time_frame_float_mod(self): + (self.df / self.df2) -class series_timestamp_compare(object): +class Timeseries(object): goal_time = 0.2 def setup(self): @@ -197,65 +83,28 @@ def setup(self): self.s = Series(date_range('20010101', periods=self.N, freq='T')) self.ts = self.s[self.halfway] + self.s2 = Series(date_range('20010101', periods=self.N, freq='s')) + def time_series_timestamp_compare(self): (self.s <= self.ts) - -class timestamp_ops_diff1(object): - goal_time = 0.2 - N = 1000000 - - def setup(self): - self.s = self.create() - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s')) + def time_timestamp_series_compare(self): + (self.ts >= self.s) def time_timestamp_ops_diff1(self): - self.s.diff() - -class timestamp_tz_ops_diff1(timestamp_ops_diff1): - N = 10000 - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) - -class timestamp_ops_diff2(object): - goal_time = 0.2 - N = 1000000 - - def setup(self): - self.s = self.create() - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s')) + self.s2.diff() def time_timestamp_ops_diff2(self): (self.s - self.s.shift()) -class timestamp_tz_ops_diff2(timestamp_ops_diff2): - N = 10000 - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) -class timestamp_series_compare(object): - goal_time = 0.2 - N = 1000000 +class TimeseriesTZ(Timeseries): def setup(self): + self.N = 1000000 self.halfway = ((self.N // 2) - 1) - self.s = self.create() + self.s = Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) self.ts = self.s[self.halfway] - def create(self): - return Series(date_range('20010101', periods=self.N, freq='T')) - - def time_timestamp_series_compare(self): - (self.ts >= self.s) - -class timestamp_tz_series_compare(timestamp_series_compare): - N = 10000 - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) + self.s2 = Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) \ No newline at end of file diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index bf1e1b3f40ab0..cca652c68cf15 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -3,89 +3,63 @@ from pandas.types.concat import union_categoricals except ImportError: pass -import string -class concat_categorical(object): +class Categoricals(object): goal_time = 0.2 def setup(self): - self.s = pd.Series((list('aabbcd') * 1000000)).astype('category') + N = 100000 + self.s = pd.Series((list('aabbcd') * N)).astype('category') - def time_concat_categorical(self): - concat([self.s, self.s]) + self.a = pd.Categorical((list('aabbcd') * N)) + self.b = pd.Categorical((list('bbcdjk') * N)) + self.categories = list('abcde') + self.cat_idx = Index(self.categories) + self.values = np.tile(self.categories, N) + self.codes = np.tile(range(len(self.categories)), N) -class union_categorical(object): - goal_time = 0.2 + self.datetimes = pd.Series(pd.date_range( + '1995-01-01 00:00:00', 
periods=10000, freq='s')) - def setup(self): - self.a = pd.Categorical((list('aabbcd') * 1000000)) - self.b = pd.Categorical((list('bbcdjk') * 1000000)) + def time_concat(self): + concat([self.s, self.s]) - def time_union_categorical(self): + def time_union(self): union_categoricals([self.a, self.b]) - -class categorical_value_counts(object): - goal_time = 1 - - def setup(self): - n = 500000 - np.random.seed(2718281) - arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] - self.ts = Series(arr).astype('category') - - def time_value_counts(self): - self.ts.value_counts(dropna=False) - - def time_value_counts_dropna(self): - self.ts.value_counts(dropna=True) - - -class categorical_constructor(object): - goal_time = 0.2 - - def setup(self): - n = 5 - N = 1e6 - self.categories = list(string.ascii_letters[:n]) - self.cat_idx = Index(self.categories) - self.values = np.tile(self.categories, N) - self.codes = np.tile(range(n), N) - - def time_regular_constructor(self): + def time_constructor_regular(self): Categorical(self.values, self.categories) - def time_fastpath(self): + def time_constructor_fastpath(self): Categorical(self.codes, self.cat_idx, fastpath=True) - -class categorical_constructor_with_datetimes(object): - goal_time = 0.2 - - def setup(self): - self.datetimes = pd.Series(pd.date_range( - '1995-01-01 00:00:00', periods=10000, freq='s')) - - def time_datetimes(self): + def time_constructor_datetimes(self): Categorical(self.datetimes) - def time_datetimes_with_nat(self): + def time_constructor_datetimes_with_nat(self): t = self.datetimes t.iloc[-1] = pd.NaT Categorical(t) -class categorical_rendering(object): - goal_time = 3e-3 +class Categoricals2(object): + goal_time = 0.2 def setup(self): - n = 1000 - items = [str(i) for i in range(n)] - s = pd.Series(items, dtype='category') - df = pd.DataFrame({'C': s, 'data': np.random.randn(n)}) - self.data = df[df.C == '20'] + n = 500000 + np.random.seed(2718281) + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + self.ts = Series(arr).astype('category') + + self.sel = self.ts.loc[[0]] + + def time_value_counts(self): + self.ts.value_counts(dropna=False) + + def time_value_counts_dropna(self): + self.ts.value_counts(dropna=True) def time_rendering(self): - str(self.data.C) + str(self.sel) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index f68cf9399c546..b5694a3a21502 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,52 +1,30 @@ from .pandas_vb_common import * -class frame_constructor_ndarray(object): +class Constructors(object): goal_time = 0.2 def setup(self): self.arr = np.random.randn(100, 100) + self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) - def time_frame_constructor_ndarray(self): - DataFrame(self.arr) - - -class ctor_index_array_string(object): - goal_time = 0.2 - - def setup(self): - self.data = np.array(['foo', 'bar', 'baz'], dtype=object) - - def time_ctor_index_array_string(self): - Index(self.data) - - -class series_constructor_ndarray(object): - goal_time = 0.2 - - def setup(self): self.data = np.random.randn(100) self.index = Index(np.arange(100)) - def time_series_constructor_ndarray(self): - Series(self.data, index=self.index) + self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')] * 1000)) + def time_frame_from_ndarray(self): + DataFrame(self.arr) -class dtindex_from_series_ctor(object): - goal_time = 0.2 + def time_series_from_ndarray(self): + pd.Series(self.data, 
index=self.index) - def setup(self): - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) + def time_index_from_array_string(self): + Index(self.arr_str) - def time_dtindex_from_series_ctor(self): + def time_dtindex_from_series(self): DatetimeIndex(self.s) - -class index_from_series_ctor(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) - - def time_index_from_series_ctor(self): + def time_dtindex_from_series2(self): Index(self.s) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index d9978e0cc4595..a0819e33dc254 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -3,7 +3,7 @@ import pandas.computation.expressions as expr -class eval_frame(object): +class Eval(object): goal_time = 0.2 params = [['numexpr', 'python'], [1, 'all']] @@ -34,8 +34,11 @@ def time_mult(self, engine, threads): df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 pd.eval('df * df2 * df3 * df4', engine=engine) + def teardown(self, engine, threads): + expr.set_numexpr_threads() -class query_datetime_index(object): + +class Query(object): goal_time = 0.2 def setup(self): @@ -45,41 +48,19 @@ def setup(self): self.s = Series(self.index) self.ts = self.s.iloc[self.halfway] self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index) + self.df2 = DataFrame({'dates': self.s.values,}) + + self.df3 = DataFrame({'a': np.random.randn(self.N),}) + self.min_val = self.df3['a'].min() + self.max_val = self.df3['a'].max() def time_query_datetime_index(self): ts = self.ts self.df.query('index < @ts') - -class query_datetime_series(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.df = DataFrame({'dates': self.s.values, }) - def time_query_datetime_series(self): ts = self.ts - self.df.query('dates < @ts') - - -class query_with_boolean_selection(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.N = 1000000 - self.df = DataFrame({'a': np.random.randn(self.N), }) - self.min_val = self.df['a'].min() - self.max_val = self.df['a'].max() + self.df2.query('dates < @ts') def time_query_with_boolean_selection(self): min_val, max_val = self.min_val, self.max_val diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 6f40611e68531..05c1a27fdf8ca 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -5,1617 +5,10 @@ from pandas.core.datetools import * -class frame_ctor_dtindex_BDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, 
off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthEndx2(object): - goal_time = 0.2 - - def 
setup(self): - self.idx = self.get_index_for_offset(BMonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in 
self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if 
(self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessHourx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessHour(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessHourx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessHourx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessHour(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessHourx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = 
Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CDayx1(object): - goal_time = 0.2 - - def setup(self): 
- self.idx = self.get_index_for_offset(CDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CustomBusinessDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CustomBusinessDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CustomBusinessDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CustomBusinessDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CustomBusinessDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CustomBusinessDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_DateOffsetx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(DateOffset(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def 
time_frame_ctor_dtindex_DateOffsetx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_DateOffsetx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(DateOffset(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_DateOffsetx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Dayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Day(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Dayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Dayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Day(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Dayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Easterx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Easter(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Easterx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - 
start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Easterx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Easter(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Easterx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx1__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx1__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx2__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx2__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = 
((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x1__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x1__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x1__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x1__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x2__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def 
time_frame_ctor_dtindex_FY5253x2__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x2__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x2__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Hourx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Hour(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Hourx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Hourx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Hour(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Hourx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_LastWeekOfMonthx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(LastWeekOfMonth(1, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_LastWeekOfMonthx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - 
self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_LastWeekOfMonthx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(LastWeekOfMonth(2, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_LastWeekOfMonthx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Microx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Micro(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Microx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Microx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Micro(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Microx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Millix1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Milli(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Millix1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def 
get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Millix2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Milli(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Millix2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Minutex1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Minute(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Minutex1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Minutex2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Minute(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Minutex2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthBeginx2(object): - goal_time = 0.2 - - def 
setup(self): - self.idx = self.get_index_for_offset(MonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) +#---------------------------------------------------------------------- +# Creation from nested dict - -class frame_ctor_dtindex_MonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Nanox1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Nano(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Nanox1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Nanox2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Nano(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, 
self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Nanox2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - 
start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Secondx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Second(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Secondx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Secondx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Second(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Secondx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_WeekOfMonthx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(WeekOfMonth(1, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_WeekOfMonthx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_WeekOfMonthx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(WeekOfMonth(2, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_WeekOfMonthx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def 
get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Weekx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Week(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Weekx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Weekx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Week(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Weekx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(YearBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(YearBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearEndx1(object): - goal_time = 0.2 - - def 
setup(self): - self.idx = self.get_index_for_offset(YearEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(YearEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_list_of_dict(object): +class FromDicts(object): goal_time = 0.2 def setup(self): @@ -1630,39 +23,26 @@ def setup(self): self.some_dict = self.data.values()[0] self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] + self.data2 = dict( + ((i, dict(((j, float(j)) for j in range(100)))) for i in + xrange(2000))) + def time_frame_ctor_list_of_dict(self): DataFrame(self.dict_list) - -class frame_ctor_nested_dict(object): - goal_time = 0.2 - - def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = self.data.values()[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] - def time_frame_ctor_nested_dict(self): DataFrame(self.data) - -class frame_ctor_nested_dict_int64(object): - goal_time = 0.2 - - def setup(self): - self.data = dict(((i, dict(((j, float(j)) for j in range(100)))) for i in xrange(2000))) + def time_series_ctor_from_dict(self): + Series(self.some_dict) def time_frame_ctor_nested_dict_int64(self): + # nested dict, integer indexes, regression described in #621 DataFrame(self.data) +# from a mi-series + class frame_from_series(object): goal_time = 0.2 @@ -1670,10 +50,13 @@ def setup(self): self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)]) self.s = Series(randn(10000), index=self.mi) - def time_frame_from_series(self): + def time_frame_from_mi_series(self): DataFrame(self.s) +#---------------------------------------------------------------------- +# get_numeric_data + class frame_get_numeric_data(object): goal_time = 0.2 @@ -1687,20 +70,69 @@ def time_frame_get_numeric_data(self): self.df._get_numeric_data() -class series_ctor_from_dict(object): - goal_time = 0.2 +# 
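For context on the nested-dict constructor benchmarked above (the #621 regression case): pandas maps the outer dict keys to columns and the inner keys to the index. A minimal sketch, assuming the same shape of data as the benchmark:

from pandas import DataFrame

data = {i: {j: float(j) for j in range(100)} for i in range(2000)}
df = DataFrame(data)
df.shape  # (100, 2000): inner keys become the index, outer keys the columns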
---------------------------------------------------------------------- +# From dict with DatetimeIndex with all offsets - def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = self.data.values()[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] +# dynamically generate benchmarks for every offset +# +# get_period_count & get_index_for_offset are there because blindly taking each +# offset times 1000 can easily go out of Timestamp bounds and raise errors. - def time_series_ctor_from_dict(self): - Series(self.some_dict) + +def get_period_count(start_date, off): + ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // ten_offsets_in_days)), 1000) + + +def get_index_for_offset(off): + start_date = Timestamp('1/1/1900') + return date_range(start_date, periods=min(1000, get_period_count( + start_date, off)), freq=off) + + +all_offsets = offsets.__all__ +# extra cases +for off in ['FY5253', 'FY5253Quarter']: + all_offsets.pop(all_offsets.index(off)) + all_offsets.extend([off + '_1', off + '_2']) + + +class FrameConstructorDTIndexFromOffsets(object): + + params = [all_offsets, [1, 2]] + param_names = ['offset', 'n_steps'] + + offset_kwargs = {'WeekOfMonth': {'weekday': 1, 'week': 1}, + 'LastWeekOfMonth': {'weekday': 1, 'week': 1}, + 'FY5253': {'startingMonth': 1, 'weekday': 1}, + 'FY5253Quarter': {'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 1}} + + offset_extra_cases = {'FY5253': {'variation': ['nearest', 'last']}, + 'FY5253Quarter': {'variation': ['nearest', 'last']}} + + def setup(self, offset, n_steps): + + extra = False + if offset.endswith("_", None, -1): + extra = int(offset[-1]) + offset = offset[:-2] + + kwargs = {} + if offset in self.offset_kwargs: + kwargs = self.offset_kwargs[offset] + + if extra: + extras = self.offset_extra_cases[offset] + for extra_arg in extras: + kwargs[extra_arg] = extras[extra_arg][extra -1] + + offset = getattr(offsets, offset) + self.idx = get_index_for_offset(offset(n_steps, **kwargs)) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor(self, offset, n_steps): + DataFrame(self.d) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index df73a474b2683..3daffb9d3a1cc 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -2,444 +2,72 @@ import string -class frame_apply_axis_1(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_axis_1(self): - self.df.apply((lambda x: (x + 1)), axis=1) - - -class frame_apply_lambda_mean(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_lambda_mean(self): - self.df.apply((lambda x: x.sum())) - - -class frame_apply_np_mean(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_np_mean(self): - self.df.apply(np.mean) - - -class frame_apply_pass_thru(object): - goal_time = 0.2 - - def setup(self): - self.df = 
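To make the Timestamp-bounds comment concrete, a rough usage sketch of the helper defined above (numbers are approximate, for illustration only):

from pandas import Timestamp
from pandas.tseries.offsets import Nano, YearEnd

def get_period_count(start_date, off):
    # same logic as the module-level helper in this patch
    ten_offsets_in_days = ((start_date + off * 10) - start_date).days
    if ten_offsets_in_days == 0:
        return 1000
    return min(9 * ((Timestamp.max - start_date).days // ten_offsets_in_days),
               1000)

start = Timestamp('1/1/1900')
get_period_count(start, Nano(1))     # 1000: ten nanoseconds span 0 days
get_period_count(start, YearEnd(2))  # far fewer (~160), so the date_range
                                     # stays below Timestamp.max (~2262-04-11)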
DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_pass_thru(self): - self.df.apply((lambda x: x)) - - -class frame_apply_ref_by_name(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) - - def time_frame_apply_ref_by_name(self): - self.df.apply((lambda x: (x['A'] + x['B'])), axis=1) - - -class frame_apply_user_func(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.arange(1028.0)) - self.df = DataFrame({i: self.s for i in range(1028)}) - - def time_frame_apply_user_func(self): - self.df.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) - - -class frame_assign_timeseries_index(object): - goal_time = 0.2 - - def setup(self): - self.idx = date_range('1/1/2000', periods=100000, freq='D') - self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - - def time_frame_assign_timeseries_index(self): - self.f(self.df) - - def f(self, df): - self.x = self.df.copy() - self.x['date'] = self.x.index - - -class frame_boolean_row_select(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.bool_arr = np.zeros(10000, dtype=bool) - self.bool_arr[:1000] = True - - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] - - -class frame_count_level_axis0_mixed_dtypes_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis0_mixed_dtypes_multi(self): - self.df.count(axis=0, level=1) - - -class frame_count_level_axis0_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis0_multi(self): - self.df.count(axis=0, level=1) - - -class frame_count_level_axis1_mixed_dtypes_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis1_mixed_dtypes_multi(self): - self.df.count(axis=1, level=1) - - -class frame_count_level_axis1_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis1_multi(self): - self.df.count(axis=1, level=1) - - -class frame_dropna_axis0_all(object): - goal_time = 0.2 - - def setup(self): - self.data = 
np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis0_all(self): - self.df.dropna(how='all', axis=0) - - -class frame_dropna_axis0_all_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis0_all_mixed_dtypes(self): - self.df.dropna(how='all', axis=0) - - -class frame_dropna_axis0_any(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis0_any(self): - self.df.dropna(how='any', axis=0) - - -class frame_dropna_axis0_any_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis0_any_mixed_dtypes(self): - self.df.dropna(how='any', axis=0) - - -class frame_dropna_axis1_all(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis1_all(self): - self.df.dropna(how='all', axis=1) - - -class frame_dropna_axis1_all_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis1_all_mixed_dtypes(self): - self.df.dropna(how='all', axis=1) - - -class frame_dropna_axis1_any(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis1_any(self): - self.df.dropna(how='any', axis=1) - - -class frame_dropna_axis1_any_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis1_any_mixed_dtypes(self): - self.df.dropna(how='any', axis=1) - - -class frame_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - - def time_frame_dtypes(self): - self.df.dtypes - - -class frame_duplicated(object): - goal_time = 0.2 - - def setup(self): - self.n = (1 << 20) - self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) - self.xs = np.random.randn((self.n // 64)).round(2) - self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - - def time_frame_duplicated(self): - self.df.duplicated() - -class frame_duplicated_wide(object): - goal_time = 0.2 - - def setup(self): - self.df = 
DataFrame(np.random.randn(1000, 100).astype(str)) - - def time_frame_duplicated_wide(self): - self.df.T.duplicated() - -class frame_fancy_lookup(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) - self.df['foo'] = 'bar' - self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') - - def time_frame_fancy_lookup(self): - self.df.lookup(self.row_labels, self.col_labels) - - -class frame_fancy_lookup_all(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) - self.df['foo'] = 'bar' - self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') - - def time_frame_fancy_lookup_all(self): - self.df.lookup(self.row_labels_all, self.col_labels_all) - - -class frame_fillna_inplace(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan - - def time_frame_fillna_inplace(self): - self.df.fillna(0, inplace=True) - +#---------------------------------------------------------------------- +# lookup -class frame_float_equal(object): +class frame_fancy_lookup(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_float_equal(self): - self.test_equal('float_df') + self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) + self.df['foo'] = 'bar' + self.row_labels = list(self.df.index[::10])[:900] + self.col_labels = (list(self.df.columns) * 100) + self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') + self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + def time_frame_fancy_lookup(self): + self.df.lookup(self.row_labels, self.col_labels) - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_frame_fancy_lookup_all(self): + self.df.lookup(self.row_labels_all, self.col_labels_all) - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) +#---------------------------------------------------------------------- +# reindex -class frame_float_unequal(object): +class Reindex(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, 
frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_float_unequal(self): - self.test_unequal('float_df') - - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) - - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) - - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) - - -class frame_from_records_generator(object): - goal_time = 0.2 - - def time_frame_from_records_generator(self): - self.df = DataFrame.from_records(self.get_data()) - - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) - - -class frame_from_records_generator_nrows(object): - goal_time = 0.2 + self.df = DataFrame(randn(10000, 1000)) + self.idx = np.arange(4000, 7000) - def time_frame_from_records_generator_nrows(self): - self.df = DataFrame.from_records(self.get_data(), nrows=1000) + self.df2 = DataFrame( + dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), + 1: randint(0, 1000, 1000).astype( + np.int16), + 2: randint(0, 1000, 1000).astype( + np.int32), + 3: randint(0, 1000, 1000).astype( + np.int64),}[randint(0, 4)]) for c in + range(1000)])) + + def time_reindex_axis0(self): + self.df.reindex(self.idx) - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) + def time_reindex_axis1(self): + self.df.reindex(columns=self.idx) + def time_reindex_both_axes(self): + self.df.reindex(index=self.idx, columns=self.idx) -class frame_get_dtype_counts(object): - goal_time = 0.2 + def time_reindex_both_axes_ix(self): + self.df.ix[(self.idx, self.idx)] - def setup(self): - self.df = DataFrame(np.random.randn(10, 10000)) + def time_reindex_upcast(self): + self.df2.reindex(permutation(range(1200))) - def time_frame_get_dtype_counts(self): - self.df.get_dtype_counts() +#---------------------------------------------------------------------- +# iteritems (monitor no-copying behaviour) -class frame_getitem_single_column(object): +class Iteration(object): goal_time = 0.2 def setup(self): self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def time_frame_getitem_single_column(self): - self.h() + self.df2 = DataFrame(np.random.randn(50000, 10)) def f(self): if hasattr(self.df, '_item_cache'): @@ -451,290 +79,254 @@ def g(self): for (name, col) in self.df.iteritems(): pass - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] - - -class frame_getitem_single_column2(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def time_frame_getitem_single_column2(self): - self.j() + def time_iteritems(self): + self.f() - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass + def time_iteritems_cached(self): + self.g() - def g(self): - for (name, col) in self.df.iteritems(): + def time_itertuples(self): + for row in self.df2.itertuples(): pass - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] +#---------------------------------------------------------------------- +# to_string, to_html, repr -class 
frame_html_repr_trunc_mi(object): +class Formatting(object): goal_time = 0.2 def setup(self): - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = MultiIndex.from_arrays(np.tile(randn(3, (self.nrows / 100)), 100)) - self.df = DataFrame(self.data, index=self.idx) - - def time_frame_html_repr_trunc_mi(self): - self.df._repr_html_() - + self.df = DataFrame(randn(100, 10)) -class frame_html_repr_trunc_si(object): - goal_time = 0.2 + self.nrows = 500 + self.df2 = DataFrame(randn(self.nrows, 10)) + self.df2[0] = period_range('2000', '2010', self.nrows) + self.df2[1] = range(self.nrows) - def setup(self): self.nrows = 10000 self.data = randn(self.nrows, 10) + self.idx = MultiIndex.from_arrays(np.tile(randn(3, int(self.nrows / 100)), 100)) + self.df3 = DataFrame(self.data, index=self.idx) self.idx = randn(self.nrows) - self.df = DataFrame(self.data, index=self.idx) - - def time_frame_html_repr_trunc_si(self): - self.df._repr_html_() - - -class frame_insert_100_columns_begin(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000 + self.df4 = DataFrame(self.data, index=self.idx) - def time_frame_insert_100_columns_begin(self): - self.f() + self.df_tall = pandas.DataFrame(np.random.randn(10000, 10)) - def f(self, K=100): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df.insert(0, i, self.new_col) + self.df_wide = pandas.DataFrame(np.random.randn(10, 10000)) + def time_to_string_floats(self): + self.df.to_string() -class frame_insert_500_columns_end(object): - goal_time = 0.2 + def time_to_html_mixed(self): + self.df2.to_html() - def setup(self): - self.N = 1000 + def time_html_repr_trunc_mi(self): + self.df3._repr_html_() - def time_frame_insert_500_columns_end(self): - self.f() + def time_html_repr_trunc_si(self): + self.df4._repr_html_() - def f(self, K=500): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df[i] = self.new_col + def time_repr_tall(self): + repr(self.df_tall) + def time_frame_repr_wide(self): + repr(self.df_wide) -class frame_interpolate(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan +#---------------------------------------------------------------------- +# nulls/masking - def time_frame_interpolate(self): - self.df.interpolate() +## masking -class frame_interpolate_some_good(object): +class frame_mask_bools(object): goal_time = 0.2 def setup(self): - self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), }) - self.df.loc[1::5, 'A'] = np.nan - self.df.loc[1::5, 'C'] = np.nan - - def time_frame_interpolate_some_good(self): - self.df.interpolate() - + self.data = np.random.randn(1000, 500) + self.df = DataFrame(self.data) + self.df = self.df.where((self.df > 0)) + self.bools = (self.df > 0) + self.mask = isnull(self.df) -class frame_interpolate_some_good_infer(object): - goal_time = 0.2 + def time_frame_mask_bools(self): + self.bools.mask(self.mask) - def setup(self): - self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), }) - self.df.loc[1::5, 'A'] = np.nan - self.df.loc[1::5, 'C'] = np.nan + def time_frame_mask_floats(self): + self.bools.astype(float).mask(self.mask) - def time_frame_interpolate_some_good_infer(self): - self.df.interpolate(downcast='infer') +## isnull -class frame_isnull_floats_no_null(object): +class 
FrameIsnull(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 1000) - self.df = DataFrame(self.data) - - def time_frame_isnull(self): - isnull(self.df) - - -class frame_isnull_floats(object): - goal_time = 0.2 + self.df_no_null = DataFrame(np.random.randn(1000, 1000)) - def setup(self): np.random.seed(1234) self.sample = np.array([np.nan, 1.0]) self.data = np.random.choice(self.sample, (1000, 1000)) self.df = DataFrame(self.data) - def time_frame_isnull(self): - isnull(self.df) - - -class frame_isnull_strings(object): - goal_time = 0.2 - - def setup(self): np.random.seed(1234) self.sample = np.array(list(string.ascii_lowercase) + list(string.ascii_uppercase) + list(string.whitespace)) self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) - - def time_frame_isnull(self): - isnull(self.df) - + self.df_strings= DataFrame(self.data) -class frame_isnull_obj(object): - goal_time = 0.2 - - def setup(self): np.random.seed(1234) self.sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) + self.df_obj = DataFrame(self.data) - def time_frame_isnull(self): + def time_isnull_floats_no_null(self): + isnull(self.df_no_null) + + def time_isnull(self): isnull(self.df) + def time_isnull_strngs(self): + isnull(self.df_strings) + + def time_isnull_obj(self): + isnull(self.df_obj) + + +# ---------------------------------------------------------------------- +# fillna in place -class frame_iteritems(object): +class frame_fillna_inplace(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) + self.df = DataFrame(randn(10000, 100)) + self.df.values[::2] = np.nan - def time_frame_iteritems(self): - self.f() + def time_frame_fillna_inplace(self): + self.df.fillna(0, inplace=True) - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass - def g(self): - for (name, col) in self.df.iteritems(): - pass - def h(self): - for i in range(10000): - self.df2['A'] +class frame_fillna_many_columns_pad(object): + goal_time = 0.2 - def j(self): - for i in range(10000): - self.df3[0] + def setup(self): + self.values = np.random.randn(1000, 1000) + self.values[::2] = np.nan + self.df = DataFrame(self.values) + + def time_frame_fillna_many_columns_pad(self): + self.df.fillna(method='pad') -class frame_iteritems_cached(object): + +class Dropna(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) + self.data = np.random.randn(10000, 1000) + self.df = DataFrame(self.data) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' - def time_frame_iteritems_cached(self): - self.g() + self.df_mi = self.df.copy() + self.df_mi.index = MultiIndex.from_tuples(self.df_mi.index.map((lambda x: (x, x)))) + self.df_mi.columns = MultiIndex.from_tuples(self.df_mi.columns.map((lambda x: (x, x)))) - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass + self.df_mixed_mi = self.df_mixed.copy() + self.df_mixed_mi.index = 
MultiIndex.from_tuples(self.df_mixed_mi.index.map((lambda x: (x, x)))) + self.df_mixed_mi.columns = MultiIndex.from_tuples(self.df_mixed_mi.columns.map((lambda x: (x, x)))) - def g(self): - for (name, col) in self.df.iteritems(): - pass + def time_dropna_axis0_all(self): + self.df.dropna(how='all', axis=0) + + def time_dropna_axis0_any(self): + self.df.dropna(how='any', axis=0) + + def time_dropna_axis1_all(self): + self.df.dropna(how='all', axis=1) + + def time_dropna_axis1_any(self): + self.df.dropna(how='any', axis=1) + + def time_dropna_axis0_all_mixed_dtypes(self): + self.df_mixed.dropna(how='all', axis=0) - def h(self): - for i in range(10000): - self.df2['A'] + def time_dropna_axis0_any_mixed_dtypes(self): + self.df_mixed.dropna(how='any', axis=0) - def j(self): - for i in range(10000): - self.df3[0] + def time_dropna_axis1_all_mixed_dtypes(self): + self.df_mixed.dropna(how='all', axis=1) + def time_dropna_axis1_any_mixed_dtypes(self): + self.df_mixed.dropna(how='any', axis=1) -class frame_itertuples(object): + def time_count_level_axis0_multi(self): + self.df_mi.count(axis=0, level=1) - def setup(self): - self.df = DataFrame(np.random.randn(50000, 10)) + def time_count_level_axis1_multi(self): + self.df_mi.count(axis=1, level=1) - def time_frame_itertuples(self): - for row in self.df.itertuples(): - pass + def time_count_level_axis0_mixed_dtypes_multi(self): + self.df_mixed_mi.count(axis=0, level=1) + def time_count_level_axis1_mixed_dtypes_multi(self): + self.df_mixed_mi.count(axis=1, level=1) -class frame_mask_bools(object): + +class Apply(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + self.df = DataFrame(np.random.randn(1000, 100)) - def time_frame_mask_bools(self): - self.bools.mask(self.mask) + self.s = Series(np.arange(1028.0)) + self.df2 = DataFrame({i: self.s for i in range(1028)}) + + self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + + def time_apply_user_func(self): + self.df2.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) + + def time_apply_axis_1(self): + self.df.apply((lambda x: (x + 1)), axis=1) + + def time_apply_lambda_mean(self): + self.df.apply((lambda x: x.sum())) + + def time_apply_np_mean(self): + self.df.apply(np.mean) + + def time_apply_pass_thru(self): + self.df.apply((lambda x: x)) + + def time_apply_ref_by_name(self): + self.df3.apply((lambda x: (x['A'] + x['B'])), axis=1) -class frame_mask_floats(object): +#---------------------------------------------------------------------- +# dtypes + +class frame_dtypes(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + self.df = DataFrame(np.random.randn(1000, 1000)) - def time_frame_mask_floats(self): - self.bools.astype(float).mask(self.mask) + def time_frame_dtypes(self): + self.df.dtypes +#---------------------------------------------------------------------- +# equals -class frame_nonunique_equal(object): +class Equals(object): goal_time = 0.2 def setup(self): @@ -742,10 +334,9 @@ def setup(self): self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) self.nonunique_cols = self.object_df.copy() self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', 
self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_nonunique_equal(self): - self.test_equal('nonunique_cols') + self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in ( + ('float_df', self.float_df), ('object_df', self.object_df), + ('nonunique_cols', self.nonunique_cols))]) def make_pair(self, frame): self.df = frame @@ -761,238 +352,259 @@ def test_unequal(self, name): (self.df, self.df2) = self.pairs[name] return self.df.equals(self.df2) + def time_frame_float_equal(self): + self.test_equal('float_df') -class frame_nonunique_unequal(object): - goal_time = 0.2 + def time_frame_float_unequal(self): + self.test_unequal('float_df') - def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) + def time_frame_nonunique_equal(self): + self.test_equal('nonunique_cols') def time_frame_nonunique_unequal(self): self.test_unequal('nonunique_cols') - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) - - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_frame_object_equal(self): + self.test_equal('object_df') - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + def time_frame_object_unequal(self): + self.test_unequal('object_df') -class frame_object_equal(object): +class Interpolate(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) + # this is the worst case, where every column has NaNs. 
+ self.df = DataFrame(randn(10000, 100)) + self.df.values[::2] = np.nan - def time_frame_object_equal(self): - self.test_equal('object_df') + self.df2 = DataFrame( + {'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), + 'C': randn(10000), 'D': randn(10000),}) + self.df2.loc[1::5, 'A'] = np.nan + self.df2.loc[1::5, 'C'] = np.nan - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + def time_interpolate(self): + self.df.interpolate() - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_interpolate_some_good(self): + self.df2.interpolate() - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + def time_interpolate_some_good_infer(self): + self.df2.interpolate(downcast='infer') -class frame_object_unequal(object): +class Shift(object): + # frame shift speedup issue-5609 goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_object_unequal(self): - self.test_unequal('object_df') + self.df = DataFrame(np.random.rand(10000, 500)) - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + def time_shift_axis0(self): + self.df.shift(1, axis=0) - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_shift_axis_1(self): + self.df.shift(1, axis=1) - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) +#----------------------------------------------------------------------------- +# from_records issue-6700 -class frame_reindex_axis0(object): +class frame_from_records_generator(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) + def get_data(self, n=100000): + return ((x, (x * 20), (x * 100)) for x in range(n)) + + def time_frame_from_records_generator(self): + self.df = DataFrame.from_records(self.get_data()) + + def time_frame_from_records_generator_nrows(self): + self.df = DataFrame.from_records(self.get_data(), nrows=1000) - def time_frame_reindex_axis0(self): - self.df.reindex(self.idx) -class frame_reindex_axis1(object): +#----------------------------------------------------------------------------- +# duplicated + +class frame_duplicated(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) + self.n = (1 << 20) + self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) + self.xs = np.random.randn((self.n // 64)).round(2) + self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) + + self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)) + + def time_frame_duplicated(self): + self.df.duplicated() + + def time_frame_duplicated_wide(self): + self.df2.T.duplicated() + + - def time_frame_reindex_axis1(self): - 
self.df.reindex(columns=self.idx) -class frame_reindex_both_axes(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) - def time_frame_reindex_both_axes(self): - self.df.reindex(index=self.idx, columns=self.idx) -class frame_reindex_both_axes_ix(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) - def time_frame_reindex_both_axes_ix(self): - self.df.ix[(self.idx, self.idx)] -class frame_reindex_upcast(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), 1: randint(0, 1000, 1000).astype(np.int16), 2: randint(0, 1000, 1000).astype(np.int32), 3: randint(0, 1000, 1000).astype(np.int64), }[randint(0, 4)]) for c in range(1000)])) - def time_frame_reindex_upcast(self): - self.df.reindex(permutation(range(1200))) -class frame_repr_tall(object): +class frame_xs_col(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10000, 10)) + self.df = DataFrame(randn(1, 100000)) - def time_frame_repr_tall(self): - repr(self.df) + def time_frame_xs_col(self): + self.df.xs(50000, axis=1) -class frame_repr_wide(object): +class frame_xs_row(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10, 10000)) + self.df = DataFrame(randn(100000, 1)) - def time_frame_repr_wide(self): - repr(self.df) + def time_frame_xs_row(self): + self.df.xs(50000) -class frame_shift_axis0(object): +class frame_sort_index(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.rand(10000, 500)) + self.df = DataFrame(randn(1000000, 2), columns=list('AB')) - def time_frame_shift_axis0(self): - self.df.shift(1, axis=0) + def time_frame_sort_index(self): + self.df.sort_index() -class frame_shift_axis_1(object): +class frame_sort_index_by_columns(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.rand(10000, 500)) + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) - def time_frame_shift_axis_1(self): - self.df.shift(1, axis=1) + def time_frame_sort_index_by_columns(self): + self.df.sort_index(by=['key1', 'key2']) -class frame_to_html_mixed(object): +class frame_quantile_axis1(object): goal_time = 0.2 def setup(self): - self.nrows = 500 - self.df = DataFrame(randn(self.nrows, 10)) - self.df[0] = period_range('2000', '2010', self.nrows) - self.df[1] = range(self.nrows) + self.df = DataFrame(np.random.randn(1000, 3), + columns=list('ABC')) + + def time_frame_quantile_axis1(self): + self.df.quantile([0.1, 0.5], axis=1) - def time_frame_to_html_mixed(self): - self.df.to_html() +#---------------------------------------------------------------------- +# boolean indexing -class frame_to_string_floats(object): +class frame_boolean_row_select(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(100, 10)) - - def time_frame_to_string_floats(self): - self.df.to_string() + self.df = DataFrame(randn(10000, 100)) + self.bool_arr = np.zeros(10000, dtype=bool) + self.bool_arr[:1000] = True + def time_frame_boolean_row_select(self): + self.df[self.bool_arr] -class frame_xs_col(object): +class frame_getitem_single_column(object): goal_time = 0.2 def setup(self): - self.df 
= DataFrame(randn(1, 100000)) + self.df = DataFrame(randn(10000, 1000)) + self.df2 = DataFrame(randn(3000, 1), columns=['A']) + self.df3 = DataFrame(randn(3000, 1)) - def time_frame_xs_col(self): - self.df.xs(50000, axis=1) + def h(self): + for i in range(10000): + self.df2['A'] + def j(self): + for i in range(10000): + self.df3[0] -class frame_xs_row(object): + def time_frame_getitem_single_column(self): + self.h() + + def time_frame_getitem_single_column2(self): + self.j() + + +#---------------------------------------------------------------------- +# assignment + +class frame_assign_timeseries_index(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(100000, 1)) + self.idx = date_range('1/1/2000', periods=100000, freq='D') + self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - def time_frame_xs_row(self): - self.df.xs(50000) + def time_frame_assign_timeseries_index(self): + self.f(self.df) + def f(self, df): + self.x = self.df.copy() + self.x['date'] = self.x.index -class frame_sort_index(object): + + +# insert many columns + +class frame_insert_100_columns_begin(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(1000000, 2), columns=list('AB')) + self.N = 1000 - def time_frame_sort_index(self): - self.df.sort_index() + def f(self, K=100): + self.df = DataFrame(index=range(self.N)) + self.new_col = np.random.randn(self.N) + for i in range(K): + self.df.insert(0, i, self.new_col) + + def g(self, K=500): + self.df = DataFrame(index=range(self.N)) + self.new_col = np.random.randn(self.N) + for i in range(K): + self.df[i] = self.new_col + + def time_frame_insert_100_columns_begin(self): + self.f() + + def time_frame_insert_500_columns_end(self): + self.g() + +#---------------------------------------------------------------------- +# strings methods, #2602 + class series_string_vector_slice(object): goal_time = 0.2 @@ -1003,15 +615,17 @@ def time_series_string_vector_slice(self): self.s.str[:5] -class frame_quantile_axis1(object): +#---------------------------------------------------------------------- +# df.info() and get_dtype_counts() # 2807 + +class frame_get_dtype_counts(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) + self.df = DataFrame(np.random.randn(10, 10000)) - def time_frame_quantile_axis1(self): - self.df.quantile([0.1, 0.5], axis=1) + def time_frame_get_dtype_counts(self): + self.df.get_dtype_counts() class frame_nlargest(object): diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 1c82560c7e630..3f53894364cd2 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -22,7 +22,7 @@ def wrapper(fname): return wrapper -class nogil_groupby_base(object): +class NoGilGroupby(object): goal_time = 0.2 def setup(self): @@ -30,167 +30,122 @@ def setup(self): self.ngroups = 1000 np.random.seed(1234) self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): - raise NotImplementedError - -class nogil_groupby_count_2(nogil_groupby_base): + np.random.seed(1234) + self.size = 2 ** 22 + self.ngroups = 100 + self.data = Series(np.random.randint(0, self.ngroups, size=self.size)) - def time_nogil_groupby_count_2(self): - self.pg2() + if (not have_real_test_parallel): + raise NotImplementedError @test_parallel(num_threads=2) - def pg2(self): + def _pg2_count(self): self.df.groupby('key')['data'].count() - -class 
nogil_groupby_last_2(nogil_groupby_base): - - def time_nogil_groupby_last_2(self): - self.pg2() + def time_count_2(self): + self._pg2_count() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_last(self): self.df.groupby('key')['data'].last() - -class nogil_groupby_max_2(nogil_groupby_base): - - def time_nogil_groupby_max_2(self): - self.pg2() + def time_last_2(self): + self._pg2_last() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_max(self): self.df.groupby('key')['data'].max() - -class nogil_groupby_mean_2(nogil_groupby_base): - - def time_nogil_groupby_mean_2(self): - self.pg2() + def time_max_2(self): + self._pg2_max() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_mean(self): self.df.groupby('key')['data'].mean() - -class nogil_groupby_min_2(nogil_groupby_base): - - def time_nogil_groupby_min_2(self): - self.pg2() + def time_mean_2(self): + self._pg2_mean() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_min(self): self.df.groupby('key')['data'].min() - -class nogil_groupby_prod_2(nogil_groupby_base): - - def time_nogil_groupby_prod_2(self): - self.pg2() + def time_min_2(self): + self._pg2_min() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_prod(self): self.df.groupby('key')['data'].prod() - -class nogil_groupby_sum_2(nogil_groupby_base): - - def time_nogil_groupby_sum_2(self): - self.pg2() + def time_prod_2(self): + self._pg2_prod() @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].sum() - - -class nogil_groupby_sum_4(nogil_groupby_base): - - def time_nogil_groupby_sum_4(self): - self.pg4() - - def f(self): + def _pg2_sum(self): self.df.groupby('key')['data'].sum() - def g4(self): - for i in range(4): - self.f() + def time_sum_2(self): + self._pg2_sum() @test_parallel(num_threads=4) - def pg4(self): - self.f() - + def _pg4_sum(self): + self.df.groupby('key')['data'].sum() -class nogil_groupby_sum_8(nogil_groupby_base): + def time_sum_4(self): + self._pg4_sum() - def time_nogil_groupby_sum_8(self): - self.pg8() + def time_sum_4_notp(self): + for i in range(4): + self.df.groupby('key')['data'].sum() - def f(self): + def _f_sum(self): self.df.groupby('key')['data'].sum() - def g8(self): - for i in range(8): - self.f() - @test_parallel(num_threads=8) - def pg8(self): - self.f() - + def _pg8_sum(self): + self._f_sum() -class nogil_groupby_var_2(nogil_groupby_base): + def time_sum_8(self): + self._pg8_sum() - def time_nogil_groupby_var_2(self): - self.pg2() + def time_sum_8_notp(self): + for i in range(8): + self._f_sum() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_var(self): self.df.groupby('key')['data'].var() + def time_var_2(self): + self._pg2_var() -class nogil_groupby_groups(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.size = 2**22 - self.ngroups = 100 - self.data = Series(np.random.randint(0, self.ngroups, size=self.size)) - if (not have_real_test_parallel): - raise NotImplementedError + # get groups - def f(self): + def _groups(self): self.data.groupby(self.data).groups - -class nogil_groupby_groups_2(nogil_groupby_groups): - - def time_nogil_groupby_groups(self): - self.pg2() - @test_parallel(num_threads=2) - def pg2(self): - self.f() - + def _pg2_groups(self): + self._groups() -class nogil_groupby_groups_4(nogil_groupby_groups): - - def time_nogil_groupby_groups(self): - self.pg4() + def time_groups_2(self): + self._pg2_groups() @test_parallel(num_threads=4) - def pg4(self): - self.f() + def _pg4_groups(self): + self._groups() + def time_groups_4(self): 
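The pattern here is that each time_* method only calls a private helper wrapped by test_parallel, so asv measures the concurrent run. A simplified, hypothetical stand-in for such a decorator (not pandas' actual implementation) might look like:

import threading

def fake_test_parallel(num_threads=2):
    # run the wrapped callable once per thread, concurrently, then join;
    # illustrative only -- the real decorator lives in the test utilities
    def wrapper(func):
        def inner(*args, **kwargs):
            threads = [threading.Thread(target=func, args=args, kwargs=kwargs)
                       for _ in range(num_threads)]
            for t in threads:
                t.start()
            for t in threads:
                t.join()
        return inner
    return wrapper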
+ self._pg4_groups() -class nogil_groupby_groups_8(nogil_groupby_groups): + @test_parallel(num_threads=8) + def _pg8_groups(self): + self._groups() - def time_nogil_groupby_groups(self): - self.pg8() + def time_groups_8(self): + self._pg8_groups() - @test_parallel(num_threads=8) - def pg8(self): - self.f() class nogil_take1d_float64(object): @@ -408,19 +363,19 @@ def create_cols(self, name): def pg_read_csv(self): read_csv('__test__.csv', sep=',', header=None, float_precision=None) - def time_nogil_read_csv(self): + def time_read_csv(self): self.pg_read_csv() @test_parallel(num_threads=2) def pg_read_csv_object(self): read_csv('__test_object__.csv', sep=',') - def time_nogil_read_csv_object(self): + def time_read_csv_object(self): self.pg_read_csv_object() @test_parallel(num_threads=2) def pg_read_csv_datetime(self): read_csv('__test_datetime__.csv', sep=',', header=None) - def time_nogil_read_csv_datetime(self): + def time_read_csv_datetime(self): self.pg_read_csv_datetime() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5f3671012e6d5..ad58cd0fc6d70 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -35,107 +35,67 @@ def time_groupby_apply_dict_return(self): #---------------------------------------------------------------------- # groups -class groupby_groups(object): +class Groups(object): goal_time = 0.1 - def setup(self): - size = 2**22 - self.data = Series(np.random.randint(0, 100, size=size)) - self.data2 = Series(np.random.randint(0, 10000, size=size)) - self.data3 = Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))) - self.data4 = Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))) - - def time_groupby_groups_int64_small(self): - self.data.groupby(self.data).groups + size = 2 ** 22 + data = { + 'int64_small': Series(np.random.randint(0, 100, size=size)), + 'int64_large' : Series(np.random.randint(0, 10000, size=size)), + 'object_small': Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))), + 'object_large': Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))) + } - def time_groupby_groups_int64_large(self): - self.data2.groupby(self.data2).groups + param_names = ['df'] + params = ['int64_small', 'int64_large', 'object_small', 'object_large'] - def time_groupby_groups_object_small(self): - self.data3.groupby(self.data3).groups + def setup(self, df): + self.df = self.data[df] - def time_groupby_groups_object_large(self): - self.data4.groupby(self.data4).groups + def time_groupby_groups(self, df): + self.df.groupby(self.df).groups #---------------------------------------------------------------------- # First / last functions -class groupby_first_last(object): - goal_time = 0.2 - - def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) - - def time_groupby_first_float32(self): - self.data2.groupby(self.labels).first() - - def time_groupby_first_float64(self): - self.data.groupby(self.labels).first() - - def time_groupby_last_float32(self): - self.data2.groupby(self.labels).last() - - def time_groupby_last_float64(self): - self.data.groupby(self.labels).last() - - def time_groupby_nth_float32_any(self): - 
self.data2.groupby(self.labels).nth(0, dropna='all') - - def time_groupby_nth_float32_none(self): - self.data2.groupby(self.labels).nth(0) - - def time_groupby_nth_float64_any(self): - self.data.groupby(self.labels).nth(0, dropna='all') - - def time_groupby_nth_float64_none(self): - self.data.groupby(self.labels).nth(0) - -# with datetimes (GH7555) - -class groupby_first_last_datetimes(object): +class FirstLast(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), }) - - def time_groupby_first_datetimes(self): - self.df.groupby('b').first() + param_names = ['dtype'] + params = ['float32', 'float64', 'datetime', 'object'] - def time_groupby_last_datetimes(self): - self.df.groupby('b').last() + # with datetimes (GH7555) - def time_groupby_nth_datetimes_any(self): - self.df.groupby('b').nth(0, dropna='all') + def setup(self, dtype): - def time_groupby_nth_datetimes_none(self): - self.df.groupby('b').nth(0) - - -class groupby_first_last_object(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000)}) + if dtype == 'datetime': + self.df = DataFrame( + {'values': date_range('1/1/2011', periods=100000, freq='s'), + 'key': range(100000),}) + elif dtype == 'object': + self.df = DataFrame( + {'values': (['foo'] * 100000), + 'key': range(100000)}) + else: + labels = np.arange(10000).repeat(10) + data = Series(randn(len(labels)), dtype=dtype) + data[::3] = np.nan + data[1::3] = np.nan + labels = labels.take(np.random.permutation(len(labels))) + self.df = DataFrame({'values': data, 'key': labels}) - def time_groupby_first_object(self): - self.df.groupby('b').first() + def time_groupby_first(self, dtype): + self.df.groupby('key').first() - def time_groupby_last_object(self): - self.df.groupby('b').last() + def time_groupby_last(self, dtype): + self.df.groupby('key').last() - def time_groupby_nth_object_any(self): - self.df.groupby('b').nth(0, dropna='any') + def time_groupby_nth_any(self, dtype): + self.df.groupby('key').nth(0, dropna='all') - def time_groupby_nth_object_none(self): - self.df.groupby('b').nth(0) + def time_groupby_nth_none(self, dtype): + self.df.groupby('key').nth(0) #---------------------------------------------------------------------- @@ -189,24 +149,6 @@ def time_sum(self): self.df.groupby(self.labels).sum() -#---------------------------------------------------------------------- -# median - -class groupby_frame(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(100000, 2) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) - - def time_groupby_frame_median(self): - self.df.groupby(self.labels).median() - - def time_groupby_simple_compress_timing(self): - self.df.groupby(self.labels).mean() - - #---------------------------------------------------------------------- # DataFrame nth @@ -405,132 +347,118 @@ def time_groupby_dt_timegrouper_size(self): #---------------------------------------------------------------------- # groupby with a variable value for ngroups -class groupby_ngroups_int_10000(object): +class GroupBySuite(object): goal_time = 0.2 - dtype = 'int' - ngroups = 10000 - def setup(self): + param_names = ['dtype', 'ngroups'] + params = [['int', 'float'], [100, 10000]] + + def setup(self, dtype, ngroups): np.random.seed(1234) - size = self.ngroups * 2 - rng = np.arange(self.ngroups) - ts = rng.take(np.random.randint(0, self.ngroups, size=size)) - if self.dtype == 
'int': - value = np.random.randint(0, size, size=size) + size = ngroups * 2 + rng = np.arange(ngroups) + values = rng.take(np.random.randint(0, ngroups, size=size)) + if dtype == 'int': + key = np.random.randint(0, size, size=size) else: - value = np.concatenate([np.random.random(self.ngroups) * 0.1, - np.random.random(self.ngroups) * 10.0]) + key = np.concatenate([np.random.random(ngroups) * 0.1, + np.random.random(ngroups) * 10.0]) - self.df = DataFrame({'timestamp': ts, - 'value': value}) + self.df = DataFrame({'values': values, + 'key': key}) - def time_all(self): - self.df.groupby('value')['timestamp'].all() + def time_all(self, dtype, ngroups): + self.df.groupby('key')['values'].all() - def time_any(self): - self.df.groupby('value')['timestamp'].any() + def time_any(self, dtype, ngroups): + self.df.groupby('key')['values'].any() - def time_count(self): - self.df.groupby('value')['timestamp'].count() + def time_count(self, dtype, ngroups): + self.df.groupby('key')['values'].count() - def time_cumcount(self): - self.df.groupby('value')['timestamp'].cumcount() + def time_cumcount(self, dtype, ngroups): + self.df.groupby('key')['values'].cumcount() - def time_cummax(self): - self.df.groupby('value')['timestamp'].cummax() + def time_cummax(self, dtype, ngroups): + self.df.groupby('key')['values'].cummax() - def time_cummin(self): - self.df.groupby('value')['timestamp'].cummin() + def time_cummin(self, dtype, ngroups): + self.df.groupby('key')['values'].cummin() - def time_cumprod(self): - self.df.groupby('value')['timestamp'].cumprod() + def time_cumprod(self, dtype, ngroups): + self.df.groupby('key')['values'].cumprod() - def time_cumsum(self): - self.df.groupby('value')['timestamp'].cumsum() + def time_cumsum(self, dtype, ngroups): + self.df.groupby('key')['values'].cumsum() - def time_describe(self): - self.df.groupby('value')['timestamp'].describe() + def time_describe(self, dtype, ngroups): + self.df.groupby('key')['values'].describe() - def time_diff(self): - self.df.groupby('value')['timestamp'].diff() + def time_diff(self, dtype, ngroups): + self.df.groupby('key')['values'].diff() - def time_first(self): - self.df.groupby('value')['timestamp'].first() + def time_first(self, dtype, ngroups): + self.df.groupby('key')['values'].first() - def time_head(self): - self.df.groupby('value')['timestamp'].head() + def time_head(self, dtype, ngroups): + self.df.groupby('key')['values'].head() - def time_last(self): - self.df.groupby('value')['timestamp'].last() + def time_last(self, dtype, ngroups): + self.df.groupby('key')['values'].last() - def time_mad(self): - self.df.groupby('value')['timestamp'].mad() + def time_mad(self, dtype, ngroups): + self.df.groupby('key')['values'].mad() - def time_max(self): - self.df.groupby('value')['timestamp'].max() + def time_max(self, dtype, ngroups): + self.df.groupby('key')['values'].max() - def time_mean(self): - self.df.groupby('value')['timestamp'].mean() + def time_mean(self, dtype, ngroups): + self.df.groupby('key')['values'].mean() - def time_median(self): - self.df.groupby('value')['timestamp'].median() + def time_median(self, dtype, ngroups): + self.df.groupby('key')['values'].median() - def time_min(self): - self.df.groupby('value')['timestamp'].min() + def time_min(self, dtype, ngroups): + self.df.groupby('key')['values'].min() - def time_nunique(self): - self.df.groupby('value')['timestamp'].nunique() + def time_nunique(self, dtype, ngroups): + self.df.groupby('key')['values'].nunique() - def time_pct_change(self): - 
self.df.groupby('value')['timestamp'].pct_change() + def time_pct_change(self, dtype, ngroups): + self.df.groupby('key')['values'].pct_change() - def time_prod(self): - self.df.groupby('value')['timestamp'].prod() + def time_prod(self, dtype, ngroups): + self.df.groupby('key')['values'].prod() - def time_rank(self): - self.df.groupby('value')['timestamp'].rank() + def time_rank(self, dtype, ngroups): + self.df.groupby('key')['values'].rank() - def time_sem(self): - self.df.groupby('value')['timestamp'].sem() + def time_sem(self, dtype, ngroups): + self.df.groupby('key')['values'].sem() - def time_size(self): - self.df.groupby('value')['timestamp'].size() + def time_size(self, dtype, ngroups): + self.df.groupby('key')['values'].size() - def time_skew(self): - self.df.groupby('value')['timestamp'].skew() + def time_skew(self, dtype, ngroups): + self.df.groupby('key')['values'].skew() - def time_std(self): - self.df.groupby('value')['timestamp'].std() - - def time_sum(self): - self.df.groupby('value')['timestamp'].sum() + def time_std(self, dtype, ngroups): + self.df.groupby('key')['values'].std() - def time_tail(self): - self.df.groupby('value')['timestamp'].tail() + def time_sum(self, dtype, ngroups): + self.df.groupby('key')['values'].sum() - def time_unique(self): - self.df.groupby('value')['timestamp'].unique() + def time_tail(self, dtype, ngroups): + self.df.groupby('key')['values'].tail() - def time_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() + def time_unique(self, dtype, ngroups): + self.df.groupby('key')['values'].unique() - def time_var(self): - self.df.groupby('value')['timestamp'].var() - -class groupby_ngroups_int_100(groupby_ngroups_int_10000): - goal_time = 0.2 - dtype = 'int' - ngroups = 100 - -class groupby_ngroups_float_100(groupby_ngroups_int_10000): - goal_time = 0.2 - dtype = 'float' - ngroups = 100 + def time_value_counts(self, dtype, ngroups): + self.df.groupby('key')['values'].value_counts() -class groupby_ngroups_float_10000(groupby_ngroups_int_10000): - goal_time = 0.2 - dtype = 'float' - ngroups = 10000 + def time_var(self, dtype, ngroups): + self.df.groupby('key')['values'].var() class groupby_float32(object): @@ -647,89 +575,75 @@ def time_groupby_sum_multiindex(self): #------------------------------------------------------------------------------- # Transform testing -class groupby_transform(object): +class Transform(object): goal_time = 0.2 def setup(self): - self.n_dates = 400 - self.n_securities = 250 - self.n_columns = 3 - self.share_na = 0.1 - self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B') - self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates)) - self.secid_min = int('10000000', 16) - self.secid_max = int('F0000000', 16) - self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) - self.security_ids = map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step))) - self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], - labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)], - names=['date', 'security_id']) - self.n_data = len(self.data_index) - self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))]) - self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns) - self.step = int((self.n_data * self.share_na)) - for column_index in 
range(self.n_columns): - self.index = column_index - while (self.index < self.n_data): - self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan) - self.index += self.step - self.f_fillna = (lambda x: x.fillna(method='pad')) - - def time_groupby_transform(self): - self.data.groupby(level='security_id').transform(self.f_fillna) + n1 = 400 + n2 = 250 - def time_groupby_transform_ufunc(self): - self.data.groupby(level='date').transform(np.max) + index = MultiIndex( + levels=[np.arange(n1), pd.util.testing.makeStringIndex(n2)], + labels=[[i for i in range(n1) for _ in range(n2)], + (list(range(n2)) * n1)], + names=['lev1', 'lev2']) + data = DataFrame(np.random.randn(n1 * n2, 3), + index=index, columns=['col1', 'col20', 'col3']) + step = int((n1 * n2 * 0.1)) + for col in range(len(data.columns)): + idx = col + while (idx < len(data)): + data.set_value(data.index[idx], data.columns[col], np.nan) + idx += step + self.df = data + self.f_fillna = (lambda x: x.fillna(method='pad')) -class groupby_transform_multi_key(object): - goal_time = 0.2 - - def setup(self): np.random.seed(2718281) - self.n = 20000 - self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) + n = 20000 + self.df1 = DataFrame(np.random.randint(1, n, (n, 3)), + columns=['jim', 'joe', 'jolie']) + self.df2 = self.df1.copy() + self.df2['jim'] = self.df2['joe'] - def time_groupby_transform_multi_key1(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)), + columns=['jim', 'joe', 'jolie']) + self.df4 = self.df3.copy() + self.df4['jim'] = self.df4['joe'] + def time_transform_func(self): + self.df.groupby(level='lev2').transform(self.f_fillna) -class groupby_transform_multi_key2(object): - goal_time = 0.2 + def time_transform_ufunc(self): + self.df.groupby(level='lev1').transform(np.max) - def setup(self): - np.random.seed(2718281) - self.n = 20000 - self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) - self.df['jim'] = self.df['joe'] + def time_transform_multi_key1(self): + self.df1.groupby(['jim', 'joe'])['jolie'].transform('max') - def time_groupby_transform_multi_key2(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + def time_transform_multi_key2(self): + self.df2.groupby(['jim', 'joe'])['jolie'].transform('max') + def time_transform_multi_key3(self): + self.df3.groupby(['jim', 'joe'])['jolie'].transform('max') -class groupby_transform_multi_key3(object): - goal_time = 0.2 + def time_transform_multi_key4(self): + self.df4.groupby(['jim', 'joe'])['jolie'].transform('max') - def setup(self): - np.random.seed(2718281) - self.n = 200000 - self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) - def time_groupby_transform_multi_key3(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') -class groupby_transform_multi_key4(object): - goal_time = 0.2 +np.random.seed(0) +N = 120000 +N_TRANSITIONS = 1400 +transition_points = np.random.permutation(np.arange(N))[:N_TRANSITIONS] +transition_points.sort() +transitions = np.zeros((N,), dtype=np.bool) +transitions[transition_points] = True +g = transitions.cumsum() +df = DataFrame({'signal': np.random.rand(N), }) + - def setup(self): - np.random.seed(2718281) - self.n = 200000 - self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) - self.df['jim'] = self.df['joe'] - def 
time_groupby_transform_multi_key4(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') class groupby_transform_series(object): @@ -737,14 +651,12 @@ class groupby_transform_series(object): def setup(self): np.random.seed(0) - self.N = 120000 - self.N_TRANSITIONS = 1400 - self.transition_points = np.random.permutation(np.arange(self.N))[:self.N_TRANSITIONS] - self.transition_points.sort() - self.transitions = np.zeros((self.N,), dtype=np.bool) - self.transitions[self.transition_points] = True - self.g = self.transitions.cumsum() - self.df = DataFrame({'signal': np.random.rand(self.N), }) + N = 120000 + transition_points = np.sort(np.random.choice(np.arange(N), 1400)) + transitions = np.zeros((N,), dtype=np.bool) + transitions[transition_points] = True + self.g = transitions.cumsum() + self.df = DataFrame({'signal': np.random.rand(N)}) def time_groupby_transform_series(self): self.df['signal'].groupby(self.g).transform(np.mean) @@ -755,38 +667,29 @@ class groupby_transform_series2(object): def setup(self): np.random.seed(0) - self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), }) + self.df = DataFrame({'key': (np.arange(100000) // 3), + 'val': np.random.randn(100000)}) - def time_groupby_transform_series2(self): - self.df.groupby('id')['val'].transform(np.mean) + self.df_nans = pd.DataFrame({'key': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df_nans.ix[4::10, 'B':'C'] = 5 + def time_transform_series2(self): + self.df.groupby('key')['val'].transform(np.mean) -class groupby_transform_dataframe(object): - # GH 12737 - goal_time = 0.2 - - def setup(self): - self.df = pd.DataFrame({'group': np.repeat(np.arange(1000), 10), - 'B': np.nan, - 'C': np.nan}) - self.df.ix[4::10, 'B':'C'] = 5 - - def time_groupby_transform_dataframe(self): - self.df.groupby('group').transform('first') + def time_cumprod(self): + self.df.groupby('key').cumprod() + def time_cumsum(self): + self.df.groupby('key').cumsum() -class groupby_transform_cythonized(object): - goal_time = 0.2 + def time_shift(self): + self.df.groupby('key').shift() - def setup(self): - np.random.seed(0) - self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), }) + def time_transform_dataframe(self): + # GH 12737 + self.df_nans.groupby('key').transform('first') - def time_groupby_transform_cumprod(self): - self.df.groupby('id').cumprod() - def time_groupby_transform_cumsum(self): - self.df.groupby('id').cumsum() - def time_groupby_transform_shift(self): - self.df.groupby('id').shift() diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index 659fc4941da54..78de5267a2969 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -2,186 +2,45 @@ import os -class query_store_table(object): +class HDF5(object): goal_time = 0.2 def setup(self): - self.f = '__test__.h5' - self.index = date_range('1/1/2000', periods=25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df12', self.df) - - def time_query_store_table(self): - self.store.select('df12', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])]) + self.index = tm.makeStringIndex(25000) + self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000),}, + index=self.index) - def teardown(self): - self.store.close() + self.df_mixed = DataFrame( + {'float1': randn(25000), 
'float2': randn(25000), + 'string1': (['foo'] * 25000), + 'bool1': ([True] * 25000), + 'int1': np.random.randint(0, 250000, size=25000),}, + index=self.index) - def remove(self, f): - try: - os.remove(self.f) - except: - pass + self.df_wide = DataFrame(np.random.randn(25000, 100)) + self.df2 = DataFrame({'float1': randn(25000), 'float2': randn(25000)}, + index=date_range('1/1/2000', periods=25000)) + self.df_wide2 = DataFrame(np.random.randn(25000, 100), + index=date_range('1/1/2000', periods=25000)) -class query_store_table_wide(object): - goal_time = 0.2 + self.df_dc = DataFrame(np.random.randn(10000, 10), + columns=[('C%03d' % i) for i in range(10)]) - def setup(self): self.f = '__test__.h5' - self.index = date_range('1/1/2000', periods=25000) - self.df = DataFrame(np.random.randn(25000, 100), index=self.index) self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df11', self.df) - def time_query_store_table_wide(self): - self.store.select('df11', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])]) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) self.store = HDFStore(self.f) self.store.put('df1', self.df) + self.store.put('df_mixed', self.df_mixed) - def time_read_store(self): - self.store.get('df1') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.put('df3', self.df) - - def time_read_store_mixed(self): - self.store.get('df3') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) + self.store.append('df5', self.df_mixed) self.store.append('df7', self.df) - def time_read_store_table(self): - self.store.select('df7') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.N = 10000 - self.index = tm.makeStringIndex(self.N) - self.df = DataFrame({'float1': randn(self.N), 'float2': randn(self.N), 'string1': (['foo'] * self.N), 'bool1': ([True] * self.N), 'int1': np.random.randint(0, self.N, size=self.N), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df5', self.df) - - def time_read_store_table_mixed(self): - self.store.select('df5') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_panel(object): - goal_time = 0.2 - - def 
setup(self): - self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in range(25)]) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('p1', self.p) - - def time_read_store_table_panel(self): - self.store.select('p1') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_wide(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(25000, 100)) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df9', self.df) + self.store.append('df9', self.df_wide) - def time_read_store_table_wide(self): - self.store.select('df9') + self.store.append('df11', self.df_wide2) + self.store.append('df12', self.df2) def teardown(self): self.store.close() @@ -192,110 +51,60 @@ def remove(self, f): except: pass + def time_read_store(self): + self.store.get('df1') -class write_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_read_store_mixed(self): + self.store.get('df_mixed') def time_write_store(self): self.store.put('df2', self.df) - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - def time_write_store_mixed(self): - self.store.put('df4', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass + self.store.put('df_mixed2', self.df_mixed) + def time_read_store_table_mixed(self): + self.store.select('df5') -class write_store_table(object): - goal_time = 0.2 + def time_write_store_table_mixed(self): + self.store.append('df6', self.df_mixed) - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_read_store_table(self): + self.store.select('df7') def time_write_store_table(self): self.store.append('df8', self.df) - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_dc(object): - goal_time = 0.2 + def time_read_store_table_wide(self): + self.store.select('df9') - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(10000, 10), columns=[('C%03d' % i) for i in range(10)]) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_write_store_table_wide(self): + self.store.append('df10', self.df_wide) def time_write_store_table_dc(self): self.store.append('df15', self.df, data_columns=True) - def teardown(self): - self.store.close() + def time_query_store_table_wide(self): + self.store.select('df11', [('index', '>', self.df_wide2.index[10000]), + ('index', '<', 
self.df_wide2.index[15000])]) - def remove(self, f): - try: - os.remove(self.f) - except: - pass + def time_query_store_table(self): + self.store.select('df12', [('index', '>', self.df2.index[10000]), + ('index', '<', self.df2.index[15000])]) -class write_store_table_mixed(object): +class HDF5Panel(object): goal_time = 0.2 def setup(self): self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 25000, size=25000), }, index=self.index) + self.p = Panel(randn(20, 1000, 25), + items=[('Item%03d' % i) for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), + minor_axis=[('E%03d' % i) for i in range(25)]) self.remove(self.f) self.store = HDFStore(self.f) - - def time_write_store_table_mixed(self): - self.store.append('df6', self.df) + self.store.append('p1', self.p) def teardown(self): self.store.close() @@ -306,46 +115,8 @@ def remove(self, f): except: pass - -class write_store_table_panel(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in range(25)]) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_read_store_table_panel(self): + self.store.select('p1') def time_write_store_table_panel(self): self.store.append('p2', self.p) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_wide(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(25000, 100)) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_table_wide(self): - self.store.append('df10', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 2c94f9b2b1e8c..3fb53ce9b3c98 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,102 +1,93 @@ from .pandas_vb_common import * -class datetime_index_intersection(object): +class SetOperations(object): goal_time = 0.2 def setup(self): self.rng = date_range('1/1/2000', periods=10000, freq='T') self.rng2 = self.rng[:(-1)] - def time_datetime_index_intersection(self): + # object index with datetime values + if (self.rng.dtype == object): + self.idx_rng = self.rng.view(Index) + else: + self.idx_rng = self.rng.asobject + self.idx_rng2 = self.idx_rng[:(-1)] + + # other datetime + N = 100000 + A = N - 20000 + B = N + 20000 + self.dtidx1 = DatetimeIndex(range(N)) + self.dtidx2 = DatetimeIndex(range(A, B)) + self.dtidx3 = DatetimeIndex(range(N, B)) + + # integer + self.N = 1000000 + self.options = np.arange(self.N) + self.left = Index( + self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) + self.right = Index( + self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) + + # strings + N = 10000 + strs = tm.rands_array(10, N) + self.leftstr = Index(strs[:N * 2 // 3]) + self.rightstr = Index(strs[N // 3:]) + + def time_datetime_intersection(self): self.rng.intersection(self.rng2) - -class datetime_index_repr(object): - goal_time = 0.2 - - def setup(self): - self.dr = pd.date_range('20000101', freq='D', periods=100000) - - def 
time_datetime_index_repr(self): - self.dr._is_dates_only - - -class datetime_index_union(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=10000, freq='T') - self.rng2 = self.rng[:(-1)] - - def time_datetime_index_union(self): + def time_datetime_union(self): self.rng.union(self.rng2) + def time_datetime_difference(self): + self.dtidx1.difference(self.dtidx2) -class index_datetime_intersection(object): - goal_time = 0.2 + def time_datetime_difference_disjoint(self): + self.dtidx1.difference(self.dtidx3) - def setup(self): - self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute()) - if (self.rng.dtype == object): - self.rng = self.rng.view(Index) - else: - self.rng = self.rng.asobject - self.rng2 = self.rng[:(-1)] + def time_datetime_symmetric_difference(self): + self.dtidx1.symmetric_difference(self.dtidx2) def time_index_datetime_intersection(self): - self.rng.intersection(self.rng2) - - -class index_datetime_union(object): - goal_time = 0.2 - - def setup(self): - self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute()) - if (self.rng.dtype == object): - self.rng = self.rng.view(Index) - else: - self.rng = self.rng.asobject - self.rng2 = self.rng[:(-1)] + self.idx_rng.intersection(self.idx_rng2) def time_index_datetime_union(self): - self.rng.union(self.rng2) + self.idx_rng.union(self.idx_rng2) + def time_int64_intersection(self): + self.left.intersection(self.right) -class index_datetime_set_difference(object): - goal_time = 0.2 + def time_int64_union(self): + self.left.union(self.right) - def setup(self): - self.N = 100000 - self.A = self.N - 20000 - self.B = self.N + 20000 - self.idx1 = DatetimeIndex(range(self.N)) - self.idx2 = DatetimeIndex(range(self.A, self.B)) - self.idx3 = DatetimeIndex(range(self.N, self.B)) + def time_int64_difference(self): + self.left.difference(self.right) - def time_index_datetime_difference(self): - self.idx1.difference(self.idx2) + def time_int64_symmetric_difference(self): + self.left.symmetric_difference(self.right) - def time_index_datetime_difference_disjoint(self): - self.idx1.difference(self.idx3) + def time_str_difference(self): + self.leftstr.difference(self.rightstr) - def time_index_datetime_symmetric_difference(self): - self.idx1.symmetric_difference(self.idx2) + def time_str_symmetric_difference(self): + self.leftstr.symmetric_difference(self.rightstr) -class index_float64_boolean_indexer(object): +class Datetime(object): goal_time = 0.2 def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + self.dr = pd.date_range('20000101', freq='D', periods=10000) - def time_index_float64_boolean_indexer(self): - self.idx[self.mask] + def time_is_dates_only(self): + self.dr._is_dates_only -class index_float64_boolean_series_indexer(object): +class Float64(object): goal_time = 0.2 def setup(self): @@ -104,141 +95,34 @@ def setup(self): self.mask = ((np.arange(self.idx.size) % 3) == 0) self.series_mask = Series(self.mask) - def time_index_float64_boolean_series_indexer(self): - self.idx[self.series_mask] - - -class index_float64_construct(object): - goal_time = 0.2 - - def setup(self): self.baseidx = np.arange(1000000.0) - def time_index_float64_construct(self): - Index(self.baseidx) - + def time_boolean_indexer(self): + self.idx[self.mask] -class index_float64_div(object): - goal_time = 0.2 + def time_boolean_series_indexer(self): + self.idx[self.series_mask] - def 
setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + def time_construct(self): + Index(self.baseidx) - def time_index_float64_div(self): + def time_div(self): (self.idx / 2) - -class index_float64_get(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_get(self): + def time_get(self): self.idx[1] - -class index_float64_mul(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_mul(self): + def time_mul(self): (self.idx * 2) - -class index_float64_slice_indexer_basic(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_slice_indexer_basic(self): + def time_slice_indexer_basic(self): self.idx[:(-1)] - -class index_float64_slice_indexer_even(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_slice_indexer_even(self): + def time_slice_indexer_even(self): self.idx[::2] -class index_int64_intersection(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - - def time_index_int64_intersection(self): - self.left.intersection(self.right) - - -class index_int64_union(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - - def time_index_int64_union(self): - self.left.union(self.right) - - -class index_int64_set_difference(object): - goal_time = 0.2 - - def setup(self): - self.N = 500000 - self.options = np.arange(self.N) - self.left = Index(self.options.take( - np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take( - np.random.permutation(self.N)[:(self.N // 2)])) - - def time_index_int64_difference(self): - self.left.difference(self.right) - - def time_index_int64_symmetric_difference(self): - self.left.symmetric_difference(self.right) - - -class index_str_set_difference(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.strs = tm.rands_array(10, self.N) - self.left = Index(self.strs[:self.N * 2 // 3]) - self.right = Index(self.strs[self.N // 3:]) - - def time_str_difference(self): - self.left.difference(self.right) - - def time_str_symmetric_difference(self): - self.left.symmetric_difference(self.right) - - -class index_str_boolean_indexer(object): +class StringIndex(object): goal_time = 0.2 def setup(self): @@ -246,47 +130,20 @@ def setup(self): self.mask = ((np.arange(1000000) % 3) == 0) self.series_mask = Series(self.mask) - def time_index_str_boolean_indexer(self): + def time_boolean_indexer(self): self.idx[self.mask] - -class index_str_boolean_series_indexer(object): - goal_time = 0.2 - - def setup(self): - 
self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_boolean_series_indexer(self): + def time_boolean_series_indexer(self): self.idx[self.series_mask] - -class index_str_slice_indexer_basic(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_slice_indexer_basic(self): + def time_slice_indexer_basic(self): self.idx[:(-1)] - -class index_str_slice_indexer_even(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_slice_indexer_even(self): + def time_slice_indexer_even(self): self.idx[::2] -class multiindex_duplicated(object): +class Multi1(object): goal_time = 0.2 def setup(self): @@ -295,21 +152,16 @@ def setup(self): self.labels = [np.random.choice(n, (k * n)) for lev in self.levels] self.mi = MultiIndex(levels=self.levels, labels=self.labels) - def time_multiindex_duplicated(self): - self.mi.duplicated() - - -class multiindex_from_product(object): - goal_time = 0.2 - - def setup(self): self.iterables = [tm.makeStringIndex(10000), range(20)] - def time_multiindex_from_product(self): + def time_duplicated(self): + self.mi.duplicated() + + def time_from_product(self): MultiIndex.from_product(self.iterables) -class multiindex_sortlevel_int64(object): +class Multi2(object): goal_time = 0.2 def setup(self): @@ -319,23 +171,22 @@ def setup(self): self.i = np.random.permutation(self.n) self.mi = MultiIndex.from_arrays([self.f(11), self.f(7), self.f(5), self.f(3), self.f(1)])[self.i] - def time_multiindex_sortlevel_int64(self): + self.a = np.repeat(np.arange(100), 1000) + self.b = np.tile(np.arange(1000), 100) + self.midx2 = MultiIndex.from_arrays([self.a, self.b]) + self.midx2 = self.midx2.take(np.random.permutation(np.arange(100000))) + + def time_sortlevel_int64(self): self.mi.sortlevel() + def time_sortlevel_zero(self): + self.midx2.sortlevel(0) -class multiindex_with_datetime_level_full(object): - goal_time = 0.2 + def time_sortlevel_one(self): + self.midx2.sortlevel(1) - def setup(self): - self.level1 = range(1000) - self.level2 = date_range(start='1/1/2012', periods=100) - self.mi = MultiIndex.from_product([self.level1, self.level2]) - def time_multiindex_with_datetime_level_full(self): - self.mi.copy().values - - -class multiindex_with_datetime_level_sliced(object): +class Multi3(object): goal_time = 0.2 def setup(self): @@ -343,5 +194,8 @@ def setup(self): self.level2 = date_range(start='1/1/2012', periods=100) self.mi = MultiIndex.from_product([self.level1, self.level2]) - def time_multiindex_with_datetime_level_sliced(self): + def time_datetime_level_values_full(self): + self.mi.copy().values + + def time_datetime_level_values_sliced(self): self.mi[:10].values diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 094ae23a92fad..27cd320c661e0 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -5,204 +5,171 @@ expr = None -class dataframe_getitem_scalar(object): +class Int64Indexing(object): goal_time = 0.2 def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.rand(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = 
self.columns[10] - - def time_dataframe_getitem_scalar(self): - self.df[self.col][self.idx] + self.s = Series(np.random.rand(1000000)) + def time_getitem_scalar(self): + self.s[800000] -class series_get_value(object): - goal_time = 0.2 + def time_getitem_slice(self): + self.s[:800000] - def setup(self): - self.index = tm.makeStringIndex(1000) - self.s = Series(np.random.rand(1000), index=self.index) - self.idx = self.index[100] + def time_getitem_list_like(self): + self.s[[800000]] - def time_series_get_value(self): - self.s.get_value(self.idx) + def time_getitem_array(self): + self.s[np.arange(10000)] + def time_iloc_array(self): + self.s.iloc[np.arange(10000)] -class time_series_getitem_scalar(object): - goal_time = 0.2 + def time_iloc_list_like(self): + self.s.iloc[[800000]] - def setup(self): - tm.N = 1000 - self.ts = tm.makeTimeSeries() - self.dt = self.ts.index[500] + def time_iloc_scalar(self): + self.s.iloc[800000] - def time_time_series_getitem_scalar(self): - self.ts[self.dt] + def time_iloc_slice(self): + self.s.iloc[:800000] + def time_ix_array(self): + self.s.ix[np.arange(10000)] -class frame_iloc_big(object): - goal_time = 0.2 + def time_ix_list_like(self): + self.s.ix[[800000]] - def setup(self): - self.df = DataFrame(dict(A=(['foo'] * 1000000))) + def time_ix_scalar(self): + self.s.ix[800000] - def time_frame_iloc_big(self): - self.df.iloc[:100, 0] + def time_ix_slice(self): + self.s.ix[:800000] + def time_loc_array(self): + self.s.loc[np.arange(10000)] -class frame_iloc_dups(object): - goal_time = 0.2 + def time_loc_list_like(self): + self.s.loc[[800000]] - def setup(self): - self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), }) - self.idx = (np.array(range(30)) * 99) - self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), }) - self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)]) + def time_loc_scalar(self): + self.s.loc[800000] - def time_frame_iloc_dups(self): - self.df2.iloc[self.idx] + def time_loc_slice(self): + self.s.loc[:800000] -class frame_loc_dups(object): +class StringIndexing(object): goal_time = 0.2 def setup(self): - self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), }) - self.idx = (np.array(range(30)) * 99) - self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), }) - self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)]) - - def time_frame_loc_dups(self): - self.df2.loc[self.idx] - + self.index = tm.makeStringIndex(1000000) + self.s = Series(np.random.rand(1000000), index=self.index) + self.lbl = self.s.index[800000] -class frame_xs_mi_ix(object): - goal_time = 0.2 + def time_getitem_label_slice(self): + self.s[:self.lbl] - def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) - self.df = DataFrame(self.s) + def time_getitem_pos_slice(self): + self.s[:800000] - def time_frame_xs_mi_ix(self): - self.df.ix[999] + def time_get_value(self): + self.s.get_value(self.lbl) -class indexing_dataframe_boolean(object): +class DatetimeIndexing(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - - def time_indexing_dataframe_boolean(self): - (self.df > self.df2) + tm.N = 1000 + self.ts = tm.makeTimeSeries() + self.dt = self.ts.index[500] + def time_getitem_scalar(self): + self.ts[self.dt] + -class indexing_dataframe_boolean_no_ne(object): +class DataFrameIndexing(object): goal_time = 0.2 def setup(self): - if (expr is 
None): - raise NotImplementedError - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - expr.set_use_numexpr(False) - - def time_indexing_dataframe_boolean_no_ne(self): - (self.df > self.df2) - - def teardown(self): - expr.set_use_numexpr(True) - - -class indexing_dataframe_boolean_rows(object): - goal_time = 0.2 + self.index = tm.makeStringIndex(1000) + self.columns = tm.makeStringIndex(30) + self.df = DataFrame(np.random.randn(1000, 30), index=self.index, + columns=self.columns) + self.idx = self.index[100] + self.col = self.columns[10] - def setup(self): - self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df['B'] > 0) + self.df2 = DataFrame(np.random.randn(10000, 4), + columns=['A', 'B', 'C', 'D']) + self.indexer = (self.df2['B'] > 0) self.obj_indexer = self.indexer.astype('O') - def time_indexing_dataframe_boolean_rows(self): - self.df[self.indexer] + # duptes + self.idx_dupe = (np.array(range(30)) * 99) + self.df3 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000),}) + self.df3 = concat([self.df3, (2 * self.df3), (3 * self.df3)]) + self.df_big = DataFrame(dict(A=(['foo'] * 1000000))) -class indexing_dataframe_boolean_rows_object(object): - goal_time = 0.2 + def time_get_value(self): + self.df.get_value(self.idx, self.col) - def setup(self): - self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df['B'] > 0) - self.obj_indexer = self.indexer.astype('O') + def time_get_value_ix(self): + self.df.ix[(self.idx, self.col)] - def time_indexing_dataframe_boolean_rows_object(self): - self.df[self.obj_indexer] + def time_getitem_scalar(self): + self.df[self.col][self.idx] + def time_boolean_rows(self): + self.df2[self.indexer] -class indexing_dataframe_boolean_st(object): - goal_time = 0.2 + def time_boolean_rows_object(self): + self.df2[self.obj_indexer] - def setup(self): - if (expr is None): - raise NotImplementedError - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - expr.set_numexpr_threads(1) + def time_iloc_dups(self): + self.df3.iloc[self.idx_dupe] - def time_indexing_dataframe_boolean_st(self): - (self.df > self.df2) + def time_loc_dups(self): + self.df3.loc[self.idx_dupe] - def teardown(self): - expr.set_numexpr_threads() + def time_iloc_big(self): + self.df_big.iloc[:100, 0] -class indexing_frame_get_value(object): +class IndexingMethods(object): + # GH 13166 goal_time = 0.2 def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - - def time_indexing_frame_get_value(self): - self.df.get_value(self.idx, self.col) + a = np.arange(100000) + self.ind = pd.Float64Index(a * 4.8000000418824129e-08) + self.s = Series(np.random.rand(100000)) + self.ts = Series(np.random.rand(100000), + index=date_range('2011-01-01', freq='S', periods=100000)) + self.indexer = ([True, False, True, True, False] * 20000) -class indexing_frame_get_value_ix(object): - goal_time = 0.2 + def time_get_loc_float(self): + self.ind.get_loc(0) - def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] + def time_take_dtindex(self): + 
self.ts.take(self.indexer) - def time_indexing_frame_get_value_ix(self): - self.df.ix[(self.idx, self.col)] + def time_take_intindex(self): + self.s.take(self.indexer) -class indexing_panel_subset(object): +class MultiIndexing(object): goal_time = 0.2 def setup(self): - self.p = Panel(np.random.randn(100, 100, 100)) - self.inds = range(0, 100, 10) - - def time_indexing_panel_subset(self): - self.p.ix[(self.inds, self.inds, self.inds)] - - -class multiindex_slicers(object): - goal_time = 0.2 + self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) + self.s = Series(np.random.randn(1000000), index=self.mi) + self.df = DataFrame(self.s) - def setup(self): + # slicers np.random.seed(1234) self.idx = pd.IndexSlice self.n = 100000 @@ -223,260 +190,22 @@ def setup(self): self.eps_D = 5000 self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() - def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] - - -class series_getitem_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_array(self): - self.s[np.arange(10000)] - - -class series_getitem_label_slice(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000000) - self.s = Series(np.random.rand(1000000), index=self.index) - self.lbl = self.s.index[800000] - - def time_series_getitem_label_slice(self): - self.s[:self.lbl] - - -class series_getitem_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_list_like(self): - self.s[[800000]] - - -class series_getitem_pos_slice(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000000) - self.s = Series(np.random.rand(1000000), index=self.index) - - def time_series_getitem_pos_slice(self): - self.s[:800000] - - -class series_getitem_scalar(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_scalar(self): - self.s[800000] - - -class series_getitem_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_slice(self): - self.s[:800000] - - -class series_iloc_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_iloc_array(self): - self.s.iloc[np.arange(10000)] - - -class series_iloc_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_iloc_list_like(self): - self.s.iloc[[800000]] - - -class series_iloc_scalar(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_iloc_scalar(self): - self.s.iloc[800000] - - -class series_iloc_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_iloc_slice(self): - self.s.iloc[:800000] - - -class series_ix_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_ix_array(self): - self.s.ix[np.arange(10000)] - - -class series_ix_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_ix_list_like(self): - 
self.s.ix[[800000]] - - -class series_ix_scalar(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_ix_scalar(self): - self.s.ix[800000] - - -class series_ix_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_ix_slice(self): - self.s.ix[:800000] - - -class series_loc_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_loc_array(self): - self.s.loc[np.arange(10000)] - - -class series_loc_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_loc_list_like(self): - self.s.loc[[800000]] - - -class series_loc_scalar(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_loc_scalar(self): - self.s.loc[800000] - - -class series_loc_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_loc_slice(self): - self.s.loc[:800000] - - -class series_take_dtindex(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) - - def time_series_take_dtindex(self): - self.ts.take(self.indexer) - - -class series_take_intindex(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) - - def time_series_take_intindex(self): - self.s.take(self.indexer) - - -class series_xs_mi_ix(object): - goal_time = 0.2 - - def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) - def time_series_xs_mi_ix(self): self.s.ix[999] + def time_frame_xs_mi_ix(self): + self.df.ix[999] -class sort_level_one(object): - goal_time = 0.2 - - def setup(self): - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx = MultiIndex.from_arrays([self.a, self.b]) - self.midx = self.midx.take(np.random.permutation(np.arange(100000))) - - def time_sort_level_one(self): - self.midx.sortlevel(1) - - -class sort_level_zero(object): - goal_time = 0.2 - - def setup(self): - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx = MultiIndex.from_arrays([self.a, self.b]) - self.midx = self.midx.take(np.random.permutation(np.arange(100000))) + def time_multiindex_slicers(self): + self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] - def time_sort_level_zero(self): - self.midx.sortlevel(0) -class float_loc(object): - # GH 13166 +class PanelIndexing(object): goal_time = 0.2 def setup(self): - a = np.arange(100000) - self.ind = pd.Float64Index(a * 4.8000000418824129e-08) + self.p = Panel(np.random.randn(100, 100, 100)) + self.inds = range(0, 100, 10) - def time_float_loc(self): - self.ind.get_loc(0) + def time_subset(self): + self.p.ix[(self.inds, self.inds, self.inds)] diff --git a/asv_bench/benchmarks/inference.py 
b/asv_bench/benchmarks/inference.py index 2e394ed4268f3..6eda93c0a1dc8 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -2,143 +2,76 @@ import pandas as pd -class dtype_infer_datetime64(object): +class DtypeInfer(object): goal_time = 0.2 - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_datetime64(self): - (self.df_datetime64['A'] - self.df_datetime64['B']) - - -class dtype_infer_float32(object): - goal_time = 0.2 + # from GH 7332 def setup(self): self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_float32(self): - (self.df_float32['A'] + self.df_float32['B']) + self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), + B=np.arange(self.N, dtype='int64'))) + self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), + B=np.arange(self.N, dtype='int32'))) + self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), + B=np.arange(self.N, dtype='uint32'))) + self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), + B=np.arange(self.N, dtype='float64'))) + self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), + B=np.arange(self.N, dtype='float32'))) + self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), + B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) + self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), + B=self.df_datetime64['B'])) + + def time_int64(self): + (self.df_int64['A'] + self.df_int64['B']) + def time_int32(self): + (self.df_int32['A'] + self.df_int32['B']) -class dtype_infer_float64(object): - goal_time = 0.2 + def time_uint32(self): + (self.df_uint32['A'] + self.df_uint32['B']) - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = 
DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_float64(self): + def time_float64(self): (self.df_float64['A'] + self.df_float64['B']) + def time_float32(self): + (self.df_float32['A'] + self.df_float32['B']) -class dtype_infer_int32(object): - goal_time = 0.2 - - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_int32(self): - (self.df_int32['A'] + self.df_int32['B']) - + def time_datetime64(self): + (self.df_datetime64['A'] - self.df_datetime64['B']) -class dtype_infer_int64(object): - goal_time = 0.2 + def time_timedelta64_1(self): + (self.df_timedelta64['A'] + self.df_timedelta64['B']) - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_int64(self): - (self.df_int64['A'] + self.df_int64['B']) + def time_timedelta64_2(self): + (self.df_timedelta64['A'] + self.df_timedelta64['A']) -class dtype_infer_timedelta64_1(object): +class to_numeric(object): goal_time = 0.2 def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, 
dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_timedelta64_1(self): - (self.df_timedelta64['A'] + self.df_timedelta64['B']) - + self.n = 10000 + self.float = Series(np.random.randn(self.n * 100)) + self.numstr = self.float.astype('str') + self.str = Series(tm.makeStringIndex(self.n)) -class dtype_infer_timedelta64_2(object): - goal_time = 0.2 - - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_timedelta64_2(self): - (self.df_timedelta64['A'] + self.df_timedelta64['A']) + def time_from_float(self): + pd.to_numeric(self.float) + def time_from_numeric_str(self): + pd.to_numeric(self.numstr) -class dtype_infer_uint32(object): - goal_time = 0.2 + def time_from_str_ignore(self): + pd.to_numeric(self.str, errors='ignore') - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) + def time_from_str_coerce(self): + pd.to_numeric(self.str, errors='coerce') -class to_numeric(object): +class to_numeric_downcast(object): param_names = ['dtype', 'downcast'] params = [['string-float', 'string-int', 'string-nint', 'datetime64', @@ -146,14 +79,15 @@ class to_numeric(object): [None, 'integer', 'signed', 'unsigned', 'float']] N = 500000 + N2 = int(N / 2) data_dict = { - 'string-int': (['1'] * (N // 2)) + ([2] * (N // 2)), - 'string-nint': (['-1'] * (N // 2)) + ([2] * 
(N // 2)), + 'string-int': (['1'] * N2) + ([2] * N2), + 'string-nint': (['-1'] * N2) + ([2] * N2), 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], dtype='datetime64[D]'), N), - 'string-float': (['1.1'] * (N // 2)) + ([2] * (N // 2)), - 'int-list': ([1] * (N // 2)) + ([2] * (N // 2)), + 'string-float': (['1.1'] * N2) + ([2] * N2), + 'int-list': ([1] * N2) + ([2] * N2), 'int32': np.repeat(np.int32(1), N) } @@ -161,4 +95,4 @@ def setup(self, dtype, downcast): self.data = self.data_dict[dtype] def time_downcast(self, dtype, downcast): - pd.to_numeric(self.data, downcast=downcast) + pd.to_numeric(self.data, downcast=downcast) \ No newline at end of file diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py index c583ac1768c90..ec855e5d33525 100644 --- a/asv_bench/benchmarks/io_sql.py +++ b/asv_bench/benchmarks/io_sql.py @@ -4,121 +4,29 @@ from sqlalchemy import create_engine -class sql_datetime_read_and_parse_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') +#------------------------------------------------------------------------------- +# to_sql - def time_sql_datetime_read_and_parse_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string']) - - -class sql_datetime_read_as_native_sqlalchemy(object): +class WriteSQL(object): goal_time = 0.2 def setup(self): self.engine = create_engine('sqlite:///:memory:') self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_datetime_read_as_native_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime']) - - -class sql_datetime_write_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_datetime_write_sqlalchemy(self): - self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace') - - -class sql_float_read_query_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_query_fallback(self): - read_sql_query('SELECT float FROM test_type', self.con) - - -class sql_float_read_query_sqlalchemy(object): - goal_time = 
0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_query_sqlalchemy(self): - read_sql_query('SELECT float FROM test_type', self.engine) - - -class sql_float_read_table_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_table_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['float']) - - -class sql_float_write_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_float_write_fallback(self): - self.df[['float']].to_sql('test_float', self.con, if_exists='replace') - + self.index = tm.makeStringIndex(10000) + self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) -class sql_float_write_sqlalchemy(object): - goal_time = 0.2 + def time_fallback(self): + self.df.to_sql('test1', self.con, if_exists='replace') - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan + def time_sqlalchemy(self): + self.df.to_sql('test1', self.engine, if_exists='replace') - def time_sql_float_write_sqlalchemy(self): - self.df[['float']].to_sql('test_float', self.engine, if_exists='replace') +#------------------------------------------------------------------------------- +# read_sql -class sql_read_query_fallback(object): +class ReadSQL(object): goal_time = 0.2 def setup(self): @@ -129,41 +37,20 @@ def setup(self): self.df.to_sql('test2', self.engine, if_exists='replace') self.df.to_sql('test2', self.con, if_exists='replace') - def time_sql_read_query_fallback(self): + def time_read_query_fallback(self): read_sql_query('SELECT * FROM test2', self.con) - -class sql_read_query_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - 
self.df.to_sql('test2', self.con, if_exists='replace') - - def time_sql_read_query_sqlalchemy(self): + def time_read_query_sqlalchemy(self): read_sql_query('SELECT * FROM test2', self.engine) - -class sql_read_table_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_sql_read_table_sqlalchemy(self): + def time_read_table_sqlalchemy(self): read_sql_table('test2', self.engine) -class sql_string_write_fallback(object): +#------------------------------------------------------------------------------- +# type specific write + +class WriteSQLTypes(object): goal_time = 0.2 def setup(self): @@ -172,44 +59,47 @@ def setup(self): self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) self.df.loc[1000:3000, 'float'] = np.nan - def time_sql_string_write_fallback(self): + def time_string_fallback(self): self.df[['string']].to_sql('test_string', self.con, if_exists='replace') + def time_string_sqlalchemy(self): + self.df[['string']].to_sql('test_string', self.engine, if_exists='replace') -class sql_string_write_sqlalchemy(object): - goal_time = 0.2 + def time_float_fallback(self): + self.df[['float']].to_sql('test_float', self.con, if_exists='replace') - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan + def time_float_sqlalchemy(self): + self.df[['float']].to_sql('test_float', self.engine, if_exists='replace') - def time_sql_string_write_sqlalchemy(self): - self.df[['string']].to_sql('test_string', self.engine, if_exists='replace') + def time_datetime_sqlalchemy(self): + self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace') -class sql_write_fallback(object): +#------------------------------------------------------------------------------- +# type specific read + +class ReadSQLTypes(object): goal_time = 0.2 def setup(self): self.engine = create_engine('sqlite:///:memory:') self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) + self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) + self.df['datetime_string'] = self.df['datetime'].map(str) + self.df.to_sql('test_type', self.engine, if_exists='replace') + self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - def time_sql_write_fallback(self): - self.df.to_sql('test1', self.con, if_exists='replace') + def time_datetime_read_and_parse_sqlalchemy(self): + read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string']) + def 
time_datetime_read_as_native_sqlalchemy(self): + read_sql_table('test_type', self.engine, columns=['datetime']) -class sql_write_sqlalchemy(object): - goal_time = 0.2 + def time_float_read_query_fallback(self): + read_sql_query('SELECT float FROM test_type', self.con) - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) + def time_float_read_query_sqlalchemy(self): + read_sql_query('SELECT float FROM test_type', self.engine) - def time_sql_write_sqlalchemy(self): - self.df.to_sql('test1', self.engine, if_exists='replace') + def time_float_read_table_sqlalchemy(self): + read_sql_table('test_type', self.engine, columns=['float']) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index c98179c8950c5..9eefe80c8e5e4 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,33 +1,20 @@ from .pandas_vb_common import * +try: + from pandas import merge_ordered +except ImportError: + from pandas import ordered_merge as merge_ordered -class append_frame_single_homogenous(object): - goal_time = 0.2 - - def setup(self): - self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.df2 = self.df1.copy() - self.df2.index = np.arange(10000, 20000) - self.mdf1 = self.df1.copy() - self.mdf1['obj1'] = 'bar' - self.mdf1['obj2'] = 'bar' - self.mdf1['int1'] = 5 - try: - self.mdf1.consolidate(inplace=True) - except: - pass - self.mdf2 = self.mdf1.copy() - self.mdf2.index = self.df2.index - def time_append_frame_single_homogenous(self): - self.df1.append(self.df2) +#---------------------------------------------------------------------- +# Append - -class append_frame_single_mixed(object): +class Append(object): goal_time = 0.2 def setup(self): - self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) + self.df1 = pd.DataFrame(np.random.randn(10000, 4), + columns=['A', 'B', 'C', 'D']) self.df2 = self.df1.copy() self.df2.index = np.arange(10000, 20000) self.mdf1 = self.df1.copy() @@ -41,33 +28,17 @@ def setup(self): self.mdf2 = self.mdf1.copy() self.mdf2.index = self.df2.index - def time_append_frame_single_mixed(self): - self.mdf1.append(self.mdf2) - - -class concat_empty_frames1(object): - goal_time = 0.2 - - def setup(self): - self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) - self.empty = pd.DataFrame() - - def time_concat_empty_frames1(self): - concat([self.df, self.empty]) - - -class concat_empty_frames2(object): - goal_time = 0.2 + def time_append_homogenous(self): + self.df1.append(self.df2) - def setup(self): - self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) - self.empty = pd.DataFrame() + def time_append_mixed(self): + self.mdf1.append(self.mdf2) - def time_concat_empty_frames2(self): - concat([self.empty, self.df]) +#---------------------------------------------------------------------- +# Concat -class concat_series_axis1(object): +class Concat(object): goal_time = 0.2 def setup(self): @@ -77,21 +48,26 @@ def setup(self): self.pieces = [self.s[i:(- i)] for i in range(1, 10)] self.pieces = (self.pieces * 50) + self.df_small = pd.DataFrame(randn(5, 4)) + + # empty + self.df = 
pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) + self.empty = pd.DataFrame() + def time_concat_series_axis1(self): concat(self.pieces, axis=1) + def time_concat_small_frames(self): + concat(([self.df_small] * 1000)) -class concat_small_frames(object): - goal_time = 0.2 - - def setup(self): - self.df = pd.DataFrame(randn(5, 4)) + def time_concat_empty_frames1(self): + concat([self.df, self.empty]) - def time_concat_small_frames(self): - concat(([self.df] * 1000)) + def time_concat_empty_frames2(self): + concat([self.empty, self.df]) -class concat_panels(object): +class ConcatPanels(object): goal_time = 0.2 def setup(self): @@ -101,26 +77,26 @@ def setup(self): self.panels_c = [pd.Panel(np.copy(dataset, order='C')) for i in range(20)] - def time_concat_c_ordered_axis0(self): + def time_c_ordered_axis0(self): concat(self.panels_c, axis=0, ignore_index=True) - def time_concat_f_ordered_axis0(self): + def time_f_ordered_axis0(self): concat(self.panels_f, axis=0, ignore_index=True) - def time_concat_c_ordered_axis1(self): + def time_c_ordered_axis1(self): concat(self.panels_c, axis=1, ignore_index=True) - def time_concat_f_ordered_axis1(self): + def time_f_ordered_axis1(self): concat(self.panels_f, axis=1, ignore_index=True) - def time_concat_c_ordered_axis2(self): + def time_c_ordered_axis2(self): concat(self.panels_c, axis=2, ignore_index=True) - def time_concat_f_ordered_axis2(self): + def time_f_ordered_axis2(self): concat(self.panels_f, axis=2, ignore_index=True) -class concat_dataframes(object): +class ConcatFrames(object): goal_time = 0.2 def setup(self): @@ -131,37 +107,23 @@ def setup(self): self.frames_c = [pd.DataFrame(np.copy(dataset, order='C')) for i in range(20)] - def time_concat_c_ordered_axis0(self): + def time_c_ordered_axis0(self): concat(self.frames_c, axis=0, ignore_index=True) - def time_concat_f_ordered_axis0(self): + def time_f_ordered_axis0(self): concat(self.frames_f, axis=0, ignore_index=True) - def time_concat_c_ordered_axis1(self): + def time_c_ordered_axis1(self): concat(self.frames_c, axis=1, ignore_index=True) - def time_concat_f_ordered_axis1(self): + def time_f_ordered_axis1(self): concat(self.frames_f, axis=1, ignore_index=True) -class i8merge(object): - goal_time = 0.2 +#---------------------------------------------------------------------- +# Joins - def setup(self): - (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20)) - self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) - self.left['left'] = self.left.sum(axis=1) - self.i = np.random.permutation(len(self.left)) - self.right = self.left.iloc[self.i].copy() - self.right.columns = (self.right.columns[:(-1)].tolist() + ['right']) - self.right.index = np.arange(len(self.right)) - self.right['right'] *= (-1) - - def time_i8merge(self): - merge(self.left, self.right, how='outer') - - -class join_dataframe_index_multi(object): +class Join(object): goal_time = 0.2 def setup(self): @@ -174,243 +136,193 @@ def setup(self): self.shuf = np.arange(100000) random.shuffle(self.shuf) try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) + self.index2 = 
MultiIndex(levels=[self.level1, self.level2], + labels=[self.label1, self.label2]) + self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), + index=self.index2, + columns=['A', 'B', 'C', 'D']) except: pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) + self.df = pd.DataFrame({'data1': np.random.randn(100000), + 'data2': np.random.randn(100000), + 'key1': self.key1, + 'key2': self.key2}) + self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), + index=self.level1, + columns=['A', 'B', 'C', 'D']) + self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), + index=self.level2, + columns=['A', 'B', 'C', 'D']) self.df_shuf = self.df.reindex(self.df.index[self.shuf]) def time_join_dataframe_index_multi(self): self.df.join(self.df_multi, on=['key1', 'key2']) - -class join_dataframe_index_single_key_bigger(object): - goal_time = 0.2 - - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) - def time_join_dataframe_index_single_key_bigger(self): self.df.join(self.df_key2, on='key2') - -class join_dataframe_index_single_key_bigger_sort(object): - goal_time = 0.2 - - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = 
DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) - def time_join_dataframe_index_single_key_bigger_sort(self): self.df_shuf.join(self.df_key2, on='key2', sort=True) + def time_join_dataframe_index_single_key_small(self): + self.df.join(self.df_key1, on='key1') + -class join_dataframe_index_single_key_small(object): +class JoinIndex(object): goal_time = 0.2 def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + np.random.seed(2718281) + self.n = 50000 + self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe']) + self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie') - def time_join_dataframe_index_single_key_small(self): - self.df.join(self.df_key1, on='key1') + def time_left_outer_join_index(self): + self.left.join(self.right, on='jim') -class join_dataframe_integer_2key(object): +class join_non_unique_equal(object): + # outer join of non-unique + # GH 6329 + goal_time = 0.2 def setup(self): - self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), }) - self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) - self.df3 = self.df[:5000] + self.date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') + self.daily_dates = self.date_index.to_period('D').to_timestamp('S', 'S') + self.fracofday = (self.date_index.view(np.ndarray) - self.daily_dates.view(np.ndarray)) + self.fracofday = (self.fracofday.astype('timedelta64[ns]').astype(np.float64) / 86400000000000.0) + self.fracofday = Series(self.fracofday, self.daily_dates) + self.index = date_range(self.date_index.min().to_period('A').to_timestamp('D', 'S'), self.date_index.max().to_period('A').to_timestamp('D', 'E'), freq='D') + self.temp = Series(1.0, self.index) + + 
def time_join_non_unique_equal(self): + (self.fracofday * self.temp[self.fracofday.index]) - def time_join_dataframe_integer_2key(self): - merge(self.df, self.df3) +#---------------------------------------------------------------------- +# Merges -class join_dataframe_integer_key(object): +class Merge(object): goal_time = 0.2 def setup(self): - self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), }) - self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) + self.N = 10000 + self.indices = tm.makeStringIndex(self.N).values + self.indices2 = tm.makeStringIndex(self.N).values + self.key = np.tile(self.indices[:8000], 10) + self.key2 = np.tile(self.indices2[:8000], 10) + self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, + 'value': np.random.randn(80000)}) + self.right = pd.DataFrame({'key': self.indices[2000:], + 'key2': self.indices2[2000:], + 'value2': np.random.randn(8000)}) + + self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), + 'key2': np.tile(np.arange(250).repeat(10), 4), + 'value': np.random.randn(10000)}) + self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500)}) self.df3 = self.df[:5000] - def time_join_dataframe_integer_key(self): - merge(self.df, self.df2, on='key1') - + def time_merge_2intkey_nosort(self): + merge(self.left, self.right, sort=False) -class merge_asof_noby(object): + def time_merge_2intkey_sort(self): + merge(self.left, self.right, sort=True) - def setup(self): - np.random.seed(0) - one_count = 200000 - two_count = 1000000 - self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), - 'value1': np.random.randn(one_count)}) - self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), - 'value2': np.random.randn(two_count)}) - self.df1 = self.df1.sort_values('time') - self.df2 = self.df2.sort_values('time') + def time_merge_dataframe_integer_2key(self): + merge(self.df, self.df3) - def time_merge_asof_noby(self): - merge_asof(self.df1, self.df2, on='time') + def time_merge_dataframe_integer_key(self): + merge(self.df, self.df2, on='key1') -class merge_asof_by_object(object): +class i8merge(object): + goal_time = 0.2 def setup(self): - import string - np.random.seed(0) - one_count = 200000 - two_count = 1000000 - self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), - 'key': np.random.choice(list(string.uppercase), one_count), - 'value1': np.random.randn(one_count)}) - self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), - 'key': np.random.choice(list(string.uppercase), two_count), - 'value2': np.random.randn(two_count)}) - self.df1 = self.df1.sort_values('time') - self.df2 = self.df2.sort_values('time') + (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20)) + self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), + columns=list('ABCDEFG')) + self.left['left'] = self.left.sum(axis=1) + self.i = np.random.permutation(len(self.left)) + self.right = self.left.iloc[self.i].copy() + self.right.columns = (self.right.columns[:(-1)].tolist() + ['right']) + self.right.index = np.arange(len(self.right)) + self.right['right'] *= (-1) + + def time_i8merge(self): + merge(self.left, self.right, how='outer') - def time_merge_asof_by_object(self): - merge_asof(self.df1, self.df2, on='time', by='key') +#---------------------------------------------------------------------- +# Ordered merge -class 
merge_asof_by_int(object): +class MergeOrdered(object): def setup(self): - np.random.seed(0) - one_count = 200000 - two_count = 1000000 - self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), - 'key': np.random.randint(0, 25, one_count), - 'value1': np.random.randn(one_count)}) - self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), - 'key': np.random.randint(0, 25, two_count), - 'value2': np.random.randn(two_count)}) - self.df1 = self.df1.sort_values('time') - self.df2 = self.df2.sort_values('time') - def time_merge_asof_by_int(self): - merge_asof(self.df1, self.df2, on='time', by='key') + groups = tm.makeStringIndex(10).values + self.left = pd.DataFrame({'group': groups.repeat(5000), + 'key' : np.tile(np.arange(0, 10000, 2), 10), + 'lvalue': np.random.randn(50000)}) -class join_non_unique_equal(object): - goal_time = 0.2 + self.right = pd.DataFrame({'key' : np.arange(10000), + 'rvalue' : np.random.randn(10000)}) - def setup(self): - self.date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') - self.daily_dates = self.date_index.to_period('D').to_timestamp('S', 'S') - self.fracofday = (self.date_index.view(np.ndarray) - self.daily_dates.view(np.ndarray)) - self.fracofday = (self.fracofday.astype('timedelta64[ns]').astype(np.float64) / 86400000000000.0) - self.fracofday = TimeSeries(self.fracofday, self.daily_dates) - self.index = date_range(self.date_index.min().to_period('A').to_timestamp('D', 'S'), self.date_index.max().to_period('A').to_timestamp('D', 'E'), freq='D') - self.temp = TimeSeries(1.0, self.index) + def time_merge_ordered(self): + merge_ordered(self.left, self.right, on='key', left_by='group') - def time_join_non_unique_equal(self): - (self.fracofday * self.temp[self.fracofday.index]) +# ---------------------------------------------------------------------- +# asof merge -class left_outer_join_index(object): - goal_time = 0.2 +class MergeAsof(object): def setup(self): - np.random.seed(2718281) - self.n = 50000 - self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe']) - self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie') - - def time_left_outer_join_index(self): - self.left.join(self.right, on='jim') - + import string + np.random.seed(0) + one_count = 200000 + two_count = 1000000 -class merge_2intkey_nosort(object): - goal_time = 0.2 + self.df1 = pd.DataFrame( + {'time': np.random.randint(0, one_count / 20, one_count), + 'key': np.random.choice(list(string.uppercase), one_count), + 'key2': np.random.randint(0, 25, one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame( + {'time': np.random.randint(0, two_count / 20, two_count), + 'key': np.random.choice(list(string.uppercase), two_count), + 'key2': np.random.randint(0, 25, two_count), + 'value2': np.random.randn(two_count)}) - def setup(self): - self.N = 10000 - self.indices = tm.makeStringIndex(self.N).values - self.indices2 = tm.makeStringIndex(self.N).values - self.key = np.tile(self.indices[:8000], 10) - self.key2 = np.tile(self.indices2[:8000], 10) - self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) - self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), }) + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') - def time_merge_2intkey_nosort(self): - merge(self.left, self.right, 
sort=False) + self.df1a = self.df1[['time', 'value1']] + self.df2a = self.df2[['time', 'value2']] + self.df1b = self.df1[['time', 'key', 'value1']] + self.df2b = self.df2[['time', 'key', 'value2']] + self.df1c = self.df1[['time', 'key2', 'value1']] + self.df2c = self.df2[['time', 'key2', 'value2']] + def time_noby(self): + merge_asof(self.df1a, self.df2a, on='time') -class merge_2intkey_sort(object): - goal_time = 0.2 + def time_by_object(self): + merge_asof(self.df1b, self.df2b, on='time', by='key') - def setup(self): - self.N = 10000 - self.indices = tm.makeStringIndex(self.N).values - self.indices2 = tm.makeStringIndex(self.N).values - self.key = np.tile(self.indices[:8000], 10) - self.key2 = np.tile(self.indices2[:8000], 10) - self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) - self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), }) + def time_by_int(self): + merge_asof(self.df1c, self.df2c, on='time', by='key2') - def time_merge_2intkey_sort(self): - merge(self.left, self.right, sort=True) +#---------------------------------------------------------------------- +# data alignment -class series_align_int64_index(object): +class Align(object): goal_time = 0.2 def setup(self): @@ -423,30 +335,12 @@ def setup(self): self.ts1 = Series(np.random.randn(self.sz), self.idx1) self.ts2 = Series(np.random.randn(self.sz), self.idx2) - def time_series_align_int64_index(self): - (self.ts1 + self.ts2) - def sample(self, values, k): self.sampler = np.random.permutation(len(values)) return values.take(self.sampler[:k]) - -class series_align_left_monotonic(object): - goal_time = 0.2 - - def setup(self): - self.n = 1000000 - self.sz = 500000 - self.rng = np.arange(0, 10000000000000, 10000000) - self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng) - self.idx1 = np.sort(self.sample(self.stamps, self.sz)) - self.idx2 = np.sort(self.sample(self.stamps, self.sz)) - self.ts1 = Series(np.random.randn(self.sz), self.idx1) - self.ts2 = Series(np.random.randn(self.sz), self.idx2) + def time_series_align_int64_index(self): + (self.ts1 + self.ts2) def time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join='left') - - def sample(self, values, k): - self.sampler = np.random.permutation(len(values)) - return values.take(self.sampler[:k]) diff --git a/asv_bench/benchmarks/miscellaneous.py b/asv_bench/benchmarks/miscellaneous.py deleted file mode 100644 index f9d577a2b56d7..0000000000000 --- a/asv_bench/benchmarks/miscellaneous.py +++ /dev/null @@ -1,52 +0,0 @@ -from .pandas_vb_common import * -from pandas.util.decorators import cache_readonly - - -class match_strings(object): - goal_time = 0.2 - - def setup(self): - self.uniques = tm.makeStringIndex(1000).values - self.all = self.uniques.repeat(10) - - def time_match_strings(self): - match(self.all, self.uniques) - - -class misc_cache_readonly(object): - goal_time = 0.2 - - def setup(self): - - - class Foo: - - @cache_readonly - def prop(self): - return 5 - self.obj = Foo() - - def time_misc_cache_readonly(self): - self.obj.prop - - -class to_numeric(object): - goal_time = 0.2 - - def setup(self): - self.n = 10000 - self.float = Series(np.random.randn(self.n * 100)) - self.numstr = self.float.astype('str') - self.str = Series(tm.makeStringIndex(self.n)) - - def time_from_float(self): - pd.to_numeric(self.float) - - def time_from_numeric_str(self): - pd.to_numeric(self.numstr) - - def time_from_str_ignore(self): - 
pd.to_numeric(self.str, errors='ignore') - - def time_from_str_coerce(self): - pd.to_numeric(self.str, errors='coerce') diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 5419571c75b43..cd43e305ead8f 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -8,28 +8,19 @@ from sqlalchemy import create_engine import numpy as np from random import randrange -from pandas.core import common as com - -class packers_read_csv(object): +class _Packers(object): goal_time = 0.2 - def setup(self): + def _setup(self): self.f = '__test__.msg' self.N = 100000 self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2 = self.df.copy() self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] self.remove(self.f) - self.df.to_csv(self.f) - - def time_packers_read_csv(self): - pd.read_csv(self.f) def remove(self, f): try: @@ -37,22 +28,21 @@ def remove(self, f): except: pass +class Packers(_Packers): + goal_time = 0.2 + + def setup(self): + self._setup() + self.df.to_csv(self.f) + + def time_packers_read_csv(self): + pd.read_csv(self.f) -class packers_read_excel(object): +class packers_read_excel(_Packers): goal_time = 0.2 def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.bio = BytesIO() self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') self.df[:2000].to_excel(self.writer) @@ -62,246 +52,94 @@ def time_packers_read_excel(self): self.bio.seek(0) pd.read_excel(self.bio) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_hdf_store(object): +class packers_read_hdf_store(_Packers): goal_time = 0.2 def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_hdf(self.f, 'df') def time_packers_read_hdf_store(self): pd.read_hdf(self.f, 'df') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_hdf_table(object): - goal_time = 0.2 +class packers_read_hdf_table(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in 
range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_hdf(self.f, 'df', format='table') def time_packers_read_hdf_table(self): pd.read_hdf(self.f, 'df') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_json(object): - goal_time = 0.2 +class packers_read_json(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df.to_json(self.f, orient='split') self.df.index = np.arange(self.N) def time_packers_read_json(self): pd.read_json(self.f, orient='split') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_json_date_index(object): - goal_time = 0.2 +class packers_read_json_date_index(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + self._setup() self.remove(self.f) self.df.to_json(self.f, orient='split') def time_packers_read_json_date_index(self): pd.read_json(self.f, orient='split') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_pack(object): - goal_time = 0.2 +class packers_read_pack(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_msgpack(self.f) def time_packers_read_pack(self): pd.read_msgpack(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_pickle(object): - goal_time = 0.2 +class packers_read_pickle(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = 
DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_pickle(self.f) def time_packers_read_pickle(self): pd.read_pickle(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_sql(object): - goal_time = 0.2 +class packers_read_sql(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.engine = create_engine('sqlite:///:memory:') self.df2.to_sql('table', self.engine, if_exists='replace') def time_packers_read_sql(self): pd.read_sql_table('table', self.engine) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_stata(object): - goal_time = 0.2 +class packers_read_stata(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df.to_stata(self.f, {'index': 'tc', }) def time_packers_read_stata(self): pd.read_stata(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_stata_with_validation(object): - goal_time = 0.2 +class packers_read_stata_with_validation(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] @@ -311,594 +149,168 @@ def setup(self): def time_packers_read_stata_with_validation(self): pd.read_stata(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_sas7bdat(object): +class packers_read_sas(_Packers): def setup(self): self.f = os.path.join(os.path.dirname(__file__), '..', '..', 'pandas', 'io', 'tests', 'sas', 'data', 'test1.sas7bdat') + self.f2 = 
os.path.join(os.path.dirname(__file__), '..', '..', + 'pandas', 'io', 'tests', 'sas', 'data', + 'paxraw_d_short.xpt') - def time_packers_read_sas7bdat(self): + def time_read_sas7bdat(self): pd.read_sas(self.f, format='sas7bdat') - -class packers_read_xport(object): - - def setup(self): - self.f = os.path.join(os.path.dirname(__file__), '..', '..', - 'pandas', 'io', 'tests', 'sas', 'data', - 'paxraw_d_short.xpt') - - def time_packers_read_xport(self): + def time_read_xport(self): pd.read_sas(self.f, format='xport') -class packers_write_csv(object): - goal_time = 0.2 +class CSV(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() - def time_packers_write_csv(self): + def time_write_csv(self): self.df.to_csv(self.f) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_excel_openpyxl(object): - goal_time = 0.2 +class Excel(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.bio = BytesIO() - def time_packers_write_excel_openpyxl(self): + def time_write_excel_openpyxl(self): self.bio.seek(0) self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl') self.df[:2000].to_excel(self.writer) self.writer.save() - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_excel_xlsxwriter(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - - def time_packers_write_excel_xlsxwriter(self): + def time_write_excel_xlsxwriter(self): self.bio.seek(0) self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') self.df[:2000].to_excel(self.writer) self.writer.save() - def remove(self, f): - try: - os.remove(self.f) - except: - pass + def time_write_excel_xlwt(self): + self.bio.seek(0) + self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') + self.df[:2000].to_excel(self.writer) + self.writer.save() -class packers_write_excel_xlwt(object): - goal_time = 0.2 +class HDF(_Packers): def 
setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - - def time_packers_write_excel_xlwt(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') - self.df[:2000].to_excel(self.writer) - self.writer.save() + self._setup() - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_hdf_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_hdf_store(self): + def time_write_hdf_store(self): self.df2.to_hdf(self.f, 'df') - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_hdf_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_hdf_table(self): + def time_write_hdf_table(self): self.df2.to_hdf(self.f, 'df', table=True) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json(object): - goal_time = 0.2 +class JSON(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() + self.df_date = self.df.copy() self.df.index = np.arange(self.N) - - def time_packers_write_json(self): - self.df.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_lines(object): - goal_time = 0.2 - - def setup(self): - self.f = 
'__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.remove(self.f) - self.df.index = np.arange(self.N) - - def time_packers_write_json_lines(self): - self.df.to_json(self.f, orient="records", lines=True) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_T(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.index = np.arange(self.N) - - def time_packers_write_json_T(self): - self.df.to_json(self.f, orient='columns') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_date_index(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_json_date_index(self): - self.df.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_delta_int_tstamp(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) self.cols = [(lambda i: ('{0}_timedelta'.format(i), [pd.Timedelta(('%d seconds' % randrange(1000000.0))) for _ in range(self.N)])), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_timestamp'.format(i), [pd.Timestamp((1418842918083256000 + randrange(1000000000.0, 1e+18, 200))) for _ in range(self.N)]))] self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - def time_packers_write_json_mixed_delta_int_tstamp(self): - self.df_mixed.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class 
packers_write_json_mixed_float_int(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - def time_packers_write_json_mixed_float_int(self): - self.df_mixed.to_json(self.f, orient='index') + self.df_mixed2 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_float_int_T(object): - goal_time = 0.2 + self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))] + self.df_mixed3 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + def time_write_json(self): + self.df.to_json(self.f, orient='split') - def time_packers_write_json_mixed_float_int_T(self): - self.df_mixed.to_json(self.f, orient='columns') + def time_write_json_T(self): + self.df.to_json(self.f, orient='columns') - def teardown(self): - self.remove(self.f) + def time_write_json_date_index(self): + self.df_date.to_json(self.f, orient='split') - def remove(self, f): - try: - os.remove(self.f) - except: - pass + def time_write_json_mixed_delta_int_tstamp(self): + self.df_mixed.to_json(self.f, orient='split') + def time_write_json_mixed_float_int(self): + self.df_mixed2.to_json(self.f, orient='index') -class packers_write_json_mixed_float_int_str(object): - goal_time = 0.2 + def time_write_json_mixed_float_int_T(self): + self.df_mixed2.to_json(self.f, orient='columns') - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - 
self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + def time_write_json_mixed_float_int_str(self): + self.df_mixed3.to_json(self.f, orient='split') - def time_packers_write_json_mixed_float_int_str(self): - self.df_mixed.to_json(self.f, orient='split') + def time_write_json_lines(self): + self.df.to_json(self.f, orient="records", lines=True) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_pack(object): - goal_time = 0.2 +class MsgPack(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() - def time_packers_write_pack(self): + def time_write_msgpack(self): self.df2.to_msgpack(self.f) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_pickle(object): - goal_time = 0.2 +class Pickle(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() - def time_packers_write_pickle(self): + def time_write_pickle(self): self.df2.to_pickle(self.f) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_sql(object): - goal_time = 0.2 +class SQL(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.engine = create_engine('sqlite:///:memory:') - def time_packers_write_sql(self): + def time_write_sql(self): self.df2.to_sql('table', self.engine, 
if_exists='replace') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_stata(object): - goal_time = 0.2 +class STATA(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_write_stata(self): - self.df.to_stata(self.f, {'index': 'tc', }) + self._setup() - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_stata_with_validation(object): - goal_time = 0.2 + self.df3=self.df.copy() + self.df3['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] + self.df3['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] + self.df3['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] + self.df3['float32_'] = np.array(randn(self.N), dtype=np.float32) - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) + def time_write_stata(self): self.df.to_stata(self.f, {'index': 'tc', }) - def time_packers_write_stata_with_validation(self): - self.df.to_stata(self.f, {'index': 'tc', }) + def time_write_stata_with_validation(self): + self.df3.to_stata(self.f, {'index': 'tc', }) def teardown(self): self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 3370131929c22..25b0b5dd4d1b0 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -20,9 +20,9 @@ import pandas.lib as lib try: - Panel = WidePanel + Panel = Panel except Exception: - pass + Panel = WidePanel # didn't add to namespace until later try: diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index 4f6fd4a5a2df8..faedce6c574ec 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,7 +1,7 @@ from .pandas_vb_common import * -class panel_from_dict_all_different_indexes(object): +class Constructors1(object): 
goal_time = 0.2 def setup(self): @@ -18,7 +18,7 @@ def time_panel_from_dict_all_different_indexes(self): Panel.from_dict(self.data_frames) -class panel_from_dict_equiv_indexes(object): +class Constructors2(object): goal_time = 0.2 def setup(self): @@ -32,7 +32,7 @@ def time_panel_from_dict_equiv_indexes(self): Panel.from_dict(self.data_frames) -class panel_from_dict_same_index(object): +class Constructors3(object): goal_time = 0.2 def setup(self): @@ -46,7 +46,7 @@ def time_panel_from_dict_same_index(self): Panel.from_dict(self.data_frames) -class panel_from_dict_two_different_indexes(object): +class Constructors4(object): goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index 0bd572db2211a..ebe278f6e68b5 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,56 +1,24 @@ from .pandas_vb_common import * -class panel_pct_change_items(object): +class PanelMethods(object): goal_time = 0.2 def setup(self): self.index = date_range(start='2000', freq='D', periods=1000) self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - def time_panel_pct_change_items(self): + def time_pct_change_items(self): self.panel.pct_change(1, axis='items') - -class panel_pct_change_major(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_pct_change_major(self): + def time_pct_change_major(self): self.panel.pct_change(1, axis='major') - -class panel_pct_change_minor(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_pct_change_minor(self): + def time_pct_change_minor(self): self.panel.pct_change(1, axis='minor') - -class panel_shift(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_shift(self): + def time_shift(self): self.panel.shift(1) - -class panel_shift_minor(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_shift_minor(self): - self.panel.shift(1, axis='minor') + def time_shift_minor(self): + self.panel.shift(1, axis='minor') \ No newline at end of file diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 6dc8bffd6dac9..32bf7e50d1a89 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -1,71 +1,49 @@ from .pandas_vb_common import * import os -from pandas import read_csv, read_table +from pandas import read_csv try: from cStringIO import StringIO except ImportError: from io import StringIO -class read_csv_comment2(object): +class read_csv1(object): goal_time = 0.2 def setup(self): - self.data = ['A,B,C'] - self.data = (self.data + (['1,2,3 # comment'] * 100000)) - self.data = '\n'.join(self.data) - - def time_read_csv_comment2(self): - read_csv(StringIO(self.data), comment='#') - - -class read_csv_default_converter(object): - goal_time = 0.2 - - def setup(self): - self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 
-0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n -0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n -0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n -0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" - self.data = (self.data * 200) - - def time_read_csv_default_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) + self.N = 10000 + self.K = 8 + self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) + self.df.to_csv('test.csv', sep='|') + self.format = (lambda x: '{:,}'.format(x)) + self.df2 = self.df.applymap(self.format) + self.df2.to_csv('test2.csv', sep='|') -class read_csv_default_converter_with_decimal(object): - goal_time = 0.2 + def time_sep(self): + read_csv('test.csv', sep='|') - def setup(self): - self.data = """0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n -0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n -0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n -0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n -0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n""" - self.data = (self.data * 200) + def time_thousands(self): + read_csv('test.csv', sep='|', thousands=',') - def time_read_csv_default_converter_with_decimal(self): - read_csv(StringIO(self.data), sep=';', header=None, - float_precision=None, decimal=',') + def teardown(self): + os.remove('test.csv') + os.remove('test2.csv') -class read_csv_precise_converter(object): +class read_csv2(object): goal_time = 0.2 def setup(self): - self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n -0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n -0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n -0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n -0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" - self.data = (self.data * 200) + self.data = ['A,B,C'] + self.data = (self.data + (['1,2,3 # comment'] * 100000)) + self.data = '\n'.join(self.data) - def time_read_csv_precise_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision='high') + def time_comment(self): + read_csv(StringIO(self.data), comment='#') -class read_csv_roundtrip_converter(object): +class read_csv3(object): goal_time = 0.2 def setup(self): @@ -74,44 +52,33 @@ def setup(self): 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" + self.data2 = self.data.replace(',', ';').replace('.', ',') self.data = (self.data * 200) + self.data2 = (self.data2 * 200) - def time_read_csv_roundtrip_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision='round_trip') - - -class read_csv_thou_vb(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 
- self.format = (lambda x: '{:,}'.format(x)) - self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) - self.df = self.df.applymap(self.format) - self.df.to_csv('test.csv', sep='|') - - def time_read_csv_thou_vb(self): - read_csv('test.csv', sep='|', thousands=',') - - def teardown(self): - os.remove('test.csv') + def time_default_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None) + def time_default_converter_with_decimal(self): + read_csv(StringIO(self.data2), sep=';', header=None, + float_precision=None, decimal=',') -class read_csv_vb(object): - goal_time = 0.2 + def time_default_converter_python_engine(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None, engine='python') - def setup(self): - self.N = 10000 - self.K = 8 - self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) - self.df.to_csv('test.csv', sep='|') + def time_default_converter_with_decimal_python_engine(self): + read_csv(StringIO(self.data2), sep=';', header=None, + float_precision=None, decimal=',', engine='python') - def time_read_csv_vb(self): - read_csv('test.csv', sep='|') + def time_precise_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision='high') - def teardown(self): - os.remove('test.csv') + def time_roundtrip_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision='round_trip') class read_csv_categorical(object): @@ -125,17 +92,17 @@ def setup(self): 'c': np.random.choice(group1, N).astype('object')}) df.to_csv('strings.csv', index=False) - def time_read_csv_categorical_post(self): + def time_convert_post(self): read_csv('strings.csv').apply(pd.Categorical) - def time_read_csv_categorical_direct(self): + def time_convert_direct(self): read_csv('strings.csv', dtype='category') def teardown(self): os.remove('strings.csv') -class read_table_multiple_date(object): +class read_csv_dateparsing(object): goal_time = 0.2 def setup(self): @@ -143,43 +110,12 @@ def setup(self): self.K = 8 self.data = 'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' self.data = (self.data * 200) + self.data2 = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' + self.data2 = (self.data2 * 200) - def time_read_table_multiple_date(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[[1, 2], [1, 3]]) - - -class read_table_multiple_date_baseline(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.data = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, 
-0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' - self.data = (self.data * 200) - - def time_read_table_multiple_date_baseline(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) - - -class read_csv_default_converter_python_engine(object): - goal_time = 0.2 - - def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' - self.data = (self.data * 200) - - def time_read_csv_default_converter(self): + def time_multiple_date(self): read_csv(StringIO(self.data), sep=',', header=None, - float_precision=None, engine='python') - + parse_dates=[[1, 2], [1, 3]]) -class read_csv_default_converter_with_decimal_python_engine(object): - goal_time = 0.2 - - def setup(self): - self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' - self.data = (self.data * 200) - - def time_read_csv_default_converter_with_decimal(self): - read_csv(StringIO(self.data), sep=';', header=None, - float_precision=None, decimal=',', engine='python') + def time_baseline(self): + read_csv(StringIO(self.data2), sep=',', header=None, parse_dates=[1]) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 75b2c2dcacfed..ff5a201057bcd 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,31 +1,33 @@ +import pandas as pd from pandas import Series, Period, PeriodIndex, date_range -class create_period_index_from_date_range(object): +class Constructor(object): goal_time = 0.2 - def time_period_index(self): - # Simulate irregular PeriodIndex - PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') + def setup(self): + self.rng = date_range('1985', periods=1000) + self.rng2 = date_range('1985', periods=1000).to_pydatetime() + + def time_from_date_range(self): + PeriodIndex(self.rng, freq='D') + def time_from_pydatetime(self): + PeriodIndex(self.rng2, freq='D') -class period_setitem(object): + +class DataFrame(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = period_range(start='1/1/1990', freq='S', periods=20000) - self.df = DataFrame(index=range(len(self.rng))) - - def time_period_setitem(self): + self.rng = pd.period_range(start='1/1/1990', freq='S', periods=20000) + self.df = pd.DataFrame(index=range(len(self.rng))) + + def time_setitem_period_column(self): self.df['col'] = self.rng -class period_algorithm(object): +class Algorithms(object): goal_time = 0.2 def setup(self): 
@@ -34,16 +36,16 @@ def setup(self): self.s = Series(data * 1000) self.i = PeriodIndex(data, freq='M') - def time_period_series_drop_duplicates(self): + def time_drop_duplicates_pseries(self): self.s.drop_duplicates() - def time_period_index_drop_duplicates(self): + def time_drop_duplicates_pindex(self): self.i.drop_duplicates() - def time_period_series_value_counts(self): + def time_value_counts_pseries(self): self.s.value_counts() - def time_period_index_value_counts(self): + def time_value_counts_pindex(self): self.i.value_counts() diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 7a4a98e2195c2..3350ddaccc496 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -7,7 +7,7 @@ def date_range(start=None, end=None, periods=None, freq=None): from pandas.tools.plotting import andrews_curves -class plot_timeseries_period(object): +class TimeseriesPlotting(object): goal_time = 0.2 def setup(self): @@ -17,11 +17,11 @@ def setup(self): self.M = 5 self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N)) - def time_plot_timeseries_period(self): + def time_plot_regular(self): self.df.plot() -class plot_andrews_curves(object): +class Misc(object): goal_time = 0.6 def setup(self): diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index b1c039058ff8f..8db0cd7629332 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -2,175 +2,52 @@ from random import shuffle -class dataframe_reindex(object): +class Reindexing(object): goal_time = 0.2 def setup(self): - self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq=datetools.Minute()) - self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, columns=range(10)) + self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') + self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, + columns=range(10)) self.df['foo'] = 'bar' self.rng2 = Index(self.rng[::2]) - def time_dataframe_reindex(self): - self.df.reindex(self.rng2) - - -class frame_drop_dup_inplace(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_drop_dup_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) - - -class frame_drop_dup_na_inplace(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - - def time_frame_drop_dup_na_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) - - -class frame_drop_duplicates(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_drop_duplicates(self): 
- self.df.drop_duplicates(['key1', 'key2']) - - -class frame_drop_duplicates_int(object): - - def setup(self): - np.random.seed(1234) - self.N = 1000000 - self.K = 10000 - self.key1 = np.random.randint(0,self.K,size=self.N) - self.df = DataFrame({'key1': self.key1}) - - def time_frame_drop_duplicates_int(self): - self.df.drop_duplicates() - - -class frame_drop_duplicates_na(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - - def time_frame_drop_duplicates_na(self): - self.df.drop_duplicates(['key1', 'key2']) - - -class frame_fillna_many_columns_pad(object): - goal_time = 0.2 - - def setup(self): - self.values = np.random.randn(1000, 1000) - self.values[::2] = np.nan - self.df = DataFrame(self.values) - - def time_frame_fillna_many_columns_pad(self): - self.df.fillna(method='pad') - - -class frame_reindex_columns(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(index=range(10000), data=np.random.rand(10000, 30), columns=range(30)) - - def time_frame_reindex_columns(self): - self.df.reindex(columns=self.df.columns[1:5]) - + self.df2 = DataFrame(index=range(10000), + data=np.random.rand(10000, 30), columns=range(30)) -class frame_sort_index_by_columns(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_sort_index_by_columns(self): - self.df.sort_index(by=['key1', 'key2']) - - -class lib_fast_zip(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) + # multi-index + N = 1000 + K = 20 + level1 = tm.makeStringIndex(N).values.repeat(K) + level2 = np.tile(tm.makeStringIndex(K).values, N) + index = MultiIndex.from_arrays([level1, level2]) + self.s1 = Series(np.random.randn((N * K)), index=index) + self.s2 = self.s1[::2] - def time_lib_fast_zip(self): - lib.fast_zip(self.col_array_list) + def time_reindex_dates(self): + self.df.reindex(self.rng2) + def time_reindex_columns(self): + self.df2.reindex(columns=self.df.columns[1:5]) -class lib_fast_zip_fillna(object): - goal_time = 0.2 + def time_reindex_multiindex(self): + self.s1.reindex(self.s2.index) - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - def time_lib_fast_zip_fillna(self): - lib.fast_zip_fillna(self.col_array_list) +#---------------------------------------------------------------------- +# Pad / backfill -class 
reindex_daterange_backfill(object): +class FillMethod(object): goal_time = 0.2 def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) + self.rng = date_range('1/1/2000', periods=100000, freq='1min') self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) self.ts2 = self.ts[::2] self.ts3 = self.ts2.reindex(self.ts.index) self.ts4 = self.ts3.astype('float32') - def time_reindex_daterange_backfill(self): - self.backfill(self.ts2, self.ts.index) - def pad(self, source_series, target_index): try: source_series.reindex(target_index, method='pad') @@ -183,215 +60,143 @@ def backfill(self, source_series, target_index): except: source_series.reindex(target_index, fillMethod='backfill') + def time_backfill_dates(self): + self.backfill(self.ts2, self.ts.index) -class reindex_daterange_pad(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_daterange_pad(self): + def time_pad_daterange(self): self.pad(self.ts2, self.ts.index) - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - - -class reindex_fillna_backfill(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_fillna_backfill(self): + def time_backfill(self): self.ts3.fillna(method='backfill') - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - - -class reindex_fillna_backfill_float32(object): - goal_time = 0.2 + def time_backfill_float32(self): + self.ts4.fillna(method='backfill') - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + def time_pad(self): + self.ts3.fillna(method='pad') - def time_reindex_fillna_backfill_float32(self): - self.ts4.fillna(method='backfill') + def time_pad_float32(self): + self.ts4.fillna(method='pad') - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +#---------------------------------------------------------------------- +# align on level -class reindex_fillna_pad(object): +class LevelAlign(object): goal_time = 0.2 
def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + self.index = MultiIndex( + levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), + index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), + index=self.index.levels[1]) - def time_reindex_fillna_pad(self): - self.ts3.fillna(method='pad') + def time_align_level(self): + self.df.align(self.df_level, level=1, copy=False) - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + def time_reindex_level(self): + self.df_level.reindex(self.df.index, level=1) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +#---------------------------------------------------------------------- +# drop_duplicates -class reindex_fillna_pad_float32(object): + +class Duplicates(object): goal_time = 0.2 def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, + 'value': np.random.randn((self.N * self.K)),}) + self.col_array_list = list(self.df.values.T) - def time_reindex_fillna_pad_float32(self): - self.ts4.fillna(method='pad') + self.df2 = self.df.copy() + self.df2.ix[:10000, :] = np.nan - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + self.s = Series(np.random.randint(0, 1000, size=10000)) + self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') + np.random.seed(1234) + self.N = 1000000 + self.K = 10000 + self.key1 = np.random.randint(0, self.K, size=self.N) + self.df_int = DataFrame({'key1': self.key1}) + def time_frame_drop_dups(self): + self.df.drop_duplicates(['key1', 'key2']) -class reindex_frame_level_align(object): - goal_time = 0.2 + def time_frame_drop_dups_inplace(self): + self.df.drop_duplicates(['key1', 'key2'], inplace=True) - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def time_frame_drop_dups_na(self): + self.df2.drop_duplicates(['key1', 'key2']) 
- def time_reindex_frame_level_align(self): - self.df.align(self.df_level, level=1, copy=False) + def time_frame_drop_dups_na_inplace(self): + self.df2.drop_duplicates(['key1', 'key2'], inplace=True) + def time_series_drop_dups_int(self): + self.s.drop_duplicates() -class reindex_frame_level_reindex(object): - goal_time = 0.2 + def time_series_drop_dups_string(self): + self.s2.drop_duplicates() - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def time_frame_drop_dups_int(self): + self.df_int.drop_duplicates() - def time_reindex_frame_level_reindex(self): - self.df_level.reindex(self.df.index, level=1) +#---------------------------------------------------------------------- +# blog "pandas escaped the zoo" -class reindex_multiindex(object): + +class Align(object): goal_time = 0.2 def setup(self): - self.N = 1000 - self.K = 20 - self.level1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.level2 = np.tile(tm.makeStringIndex(self.K).values, self.N) - self.index = MultiIndex.from_arrays([self.level1, self.level2]) - self.s1 = Series(np.random.randn((self.N * self.K)), index=self.index) - self.s2 = self.s1[::2] - - def time_reindex_multiindex(self): - self.s1.reindex(self.s2.index) - + n = 50000 + indices = tm.makeStringIndex(n) + subsample_size = 40000 -class series_align_irregular_string(object): - goal_time = 0.2 + def sample(values, k): + sampler = np.arange(len(values)) + shuffle(sampler) + return values.take(sampler[:k]) - def setup(self): - self.n = 50000 - self.indices = tm.makeStringIndex(self.n) - self.subsample_size = 40000 - self.x = Series(np.random.randn(50000), self.indices) - self.y = Series(np.random.randn(self.subsample_size), index=self.sample(self.indices, self.subsample_size)) + self.x = Series(np.random.randn(50000), indices) + self.y = Series(np.random.randn(subsample_size), + index=sample(indices, subsample_size)) - def time_series_align_irregular_string(self): + def time_align_series_irregular_string(self): (self.x + self.y) - def sample(self, values, k): - self.sampler = np.arange(len(values)) - shuffle(self.sampler) - return values.take(self.sampler[:k]) - -class series_drop_duplicates_int(object): +class LibFastZip(object): goal_time = 0.2 def setup(self): - self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - - def time_series_drop_duplicates_int(self): - self.s.drop_duplicates() - + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) -class series_drop_duplicates_string(object): - goal_time = 0.2 + self.df2 = self.df.copy() + self.df2.ix[:10000, :] = np.nan + self.col_array_list2 = list(self.df2.values.T) - def setup(self): - self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) + def time_lib_fast_zip(self): + lib.fast_zip(self.col_array_list) - def time_series_drop_duplicates_string(self): - 
self.s2.drop_duplicates() + def time_lib_fast_zip_fillna(self): + lib.fast_zip_fillna(self.col_array_list2) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d64606214ca6a..c1600d4e07f58 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -4,390 +4,104 @@ import pandas.util.testing as testing -class strings_cat(object): +class StringMethods(object): goal_time = 0.2 - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_cat(self): - self.many.str.cat(sep=',') - def make_series(self, letters, strlen, size): return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - -class strings_center(object): - goal_time = 0.2 - def setup(self): self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + self.s = self.make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|') - def time_strings_center(self): - self.many.str.center(100) - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_few(object): - goal_time = 0.2 + def time_cat(self): + self.many.str.cat(sep=',') - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def time_center(self): + self.many.str.center(100) - def time_strings_contains_few(self): + def time_contains_few(self): self.few.str.contains('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_few_noregex(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_contains_few_noregex(self): + def time_contains_few_noregex(self): self.few.str.contains('matchthis', regex=False) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_many(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_contains_many(self): + def time_contains_many(self): self.many.str.contains('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_many_noregex(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), 
strlen=19, size=10000) - - def time_strings_contains_many_noregex(self): + def time_contains_many_noregex(self): self.many.str.contains('matchthis', regex=False) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_count(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_count(self): + def time_count(self): self.many.str.count('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_encode_decode(object): - goal_time = 0.2 - - def setup(self): - self.ser = Series(testing.makeUnicodeIndex()) - - def time_strings_encode_decode(self): - self.ser.str.encode('utf-8').str.decode('utf-8') - - -class strings_endswith(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_endswith(self): + def time_endswith(self): self.many.str.endswith('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_extract(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_extract(self): + def time_extract(self): self.many.str.extract('(\\w*)matchthis(\\w*)') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_findall(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_findall(self): + def time_findall(self): self.many.str.findall('[A-Z]+') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_get(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_get(self): + def time_get(self): self.many.str.get(0) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_get_dummies(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - self.s = 
self.make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|') - - def time_strings_get_dummies(self): - self.s.str.get_dummies('|') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_join_split(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_join_split(self): + def time_join_split(self): self.many.str.join('--').str.split('--') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_join_split_expand(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_join_split_expand(self): + def time_join_split_expand(self): self.many.str.join('--').str.split('--', expand=True) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_len(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_len(self): + def time_len(self): self.many.str.len() - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_lower(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_lower(self): - self.many.str.lower() - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_lstrip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_lstrip(self): - self.many.str.lstrip('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_match(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_match(self): + def time_match(self): self.many.str.match('mat..this') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), 
dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_pad(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_pad(self): + def time_pad(self): self.many.str.pad(100, side='both') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_repeat(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_repeat(self): + def time_repeat(self): self.many.str.repeat(list(IT.islice(IT.cycle(range(1, 4)), len(self.many)))) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_replace(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_replace(self): + def time_replace(self): self.many.str.replace('(matchthis)', '\x01\x01') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_rstrip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_rstrip(self): - self.many.str.rstrip('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_slice(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_slice(self): + def time_slice(self): self.many.str.slice(5, 15, 2) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_startswith(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_startswith(self): + def time_startswith(self): self.many.str.startswith('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_strip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + 
(string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_strip(self): + def time_strip(self): self.many.str.strip('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_rstrip(self): + self.many.str.rstrip('matchthis') + def time_lstrip(self): + self.many.str.lstrip('matchthis') -class strings_title(object): - goal_time = 0.2 + def time_title(self): + self.many.str.title() - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def time_upper(self): + self.many.str.upper() - def time_strings_title(self): - self.many.str.title() + def time_lower(self): + self.many.str.lower() - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_get_dummies(self): + self.s.str.get_dummies('|') -class strings_upper(object): +class StringEncode(object): goal_time = 0.2 def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_upper(self): - self.many.str.upper() + self.ser = Series(testing.makeUnicodeIndex()) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_encode_decode(self): + self.ser.str.encode('utf-8').str.decode('utf-8') diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 8470525dd01fa..c112d1ef72eb8 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -2,54 +2,36 @@ from pandas import to_timedelta, Timestamp -class timedelta_convert_int(object): +class ToTimedelta(object): goal_time = 0.2 def setup(self): self.arr = np.random.randint(0, 1000, size=10000) + self.arr2 = ['{0} days'.format(i) for i in self.arr] - def time_timedelta_convert_int(self): - to_timedelta(self.arr, unit='s') - + self.arr3 = np.random.randint(0, 60, size=10000) + self.arr3 = ['00:00:{0:02d}'.format(i) for i in self.arr3] -class timedelta_convert_string(object): - goal_time = 0.2 - - def setup(self): - self.arr = np.random.randint(0, 1000, size=10000) - self.arr = ['{0} days'.format(i) for i in self.arr] - - def time_timedelta_convert_string(self): - to_timedelta(self.arr) - - -class timedelta_convert_string_seconds(object): - goal_time = 0.2 - - def setup(self): - self.arr = np.random.randint(0, 60, size=10000) - self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr] - - def time_timedelta_convert_string_seconds(self): - to_timedelta(self.arr) + self.arr4 = list(self.arr2) + self.arr4[-1] = 'apple' + def time_convert_int(self): + to_timedelta(self.arr, unit='s') -class timedelta_convert_bad_parse(object): - goal_time = 0.2 + def time_convert_string(self): + to_timedelta(self.arr2) - def setup(self): - self.arr = np.random.randint(0, 1000, size=10000) - self.arr = ['{0} days'.format(i) for i in self.arr] - self.arr[-1] = 'apple' + def time_convert_string_seconds(self): + to_timedelta(self.arr3) - def time_timedelta_convert_coerce(self): - to_timedelta(self.arr, errors='coerce') + def 
time_convert_coerce(self): + to_timedelta(self.arr4, errors='coerce') - def time_timedelta_convert_ignore(self): - to_timedelta(self.arr, errors='ignore') + def time_convert_ignore(self): + to_timedelta(self.arr4, errors='ignore') -class timedelta_add_overflow(object): +class Ops(object): goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 8c00924cb07ef..6e9ef4b10273c 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -10,295 +10,211 @@ from pandas.tseries.frequencies import infer_freq import numpy as np +if hasattr(Series, 'convert'): + Series.resample = Series.convert -class dataframe_resample_max_numpy(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - def time_dataframe_resample_max_numpy(self): - self.df.resample('1s', how=np.max) - - -class dataframe_resample_max_string(object): +class DatetimeIndex(object): goal_time = 0.2 def setup(self): self.N = 100000 self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - - def time_dataframe_resample_max_string(self): - self.df.resample('1s', how='max') - + self.delta_offset = pd.offsets.Day() + self.fast_offset = pd.offsets.DateOffset(months=2, days=2) + self.slow_offset = pd.offsets.BusinessDay() -class dataframe_resample_mean_numpy(object): - goal_time = 0.2 + self.rng2 = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + self.index_repeated = date_range(start='1/1/2000', periods=1000, freq='T').repeat(10) - def time_dataframe_resample_mean_numpy(self): - self.df.resample('1s', how=np.mean) + self.rng3 = date_range(start='1/1/2000', periods=1000, freq='H') + self.df = DataFrame(np.random.randn(len(self.rng3), 2), self.rng3) + self.rng4 = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') + self.df2 = DataFrame(np.random.randn(len(self.rng4), 2), index=self.rng4) -class dataframe_resample_mean_string(object): - goal_time = 0.2 + N = 100000 + self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) + self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, + tz='Asia/Tokyo').repeat(5) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + self.rng5 = date_range(start='1/1/2000', 
end='3/1/2000', tz='US/Eastern') - def time_dataframe_resample_mean_string(self): - self.df.resample('1s', how='mean') + self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') + self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') + self.index = self.index.append(self.dst_rng) + self.index = self.index.append(self.dst_rng) + self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S')) + self.N = 10000 + self.rng6 = date_range(start='1/1/1', periods=self.N, freq='B') -class dataframe_resample_min_numpy(object): - goal_time = 0.2 + self.rng7 = date_range(start='1/1/1700', freq='D', periods=100000) + self.a = self.rng7[:50000].append(self.rng7[50002:]) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + def time_add_timedelta(self): + (self.rng + timedelta(minutes=2)) - def time_dataframe_resample_min_numpy(self): - self.df.resample('1s', how=np.min) + def time_add_offset_delta(self): + (self.rng + self.delta_offset) + def time_add_offset_fast(self): + (self.rng + self.fast_offset) -class dataframe_resample_min_string(object): - goal_time = 0.2 + def time_add_offset_slow(self): + (self.rng + self.slow_offset) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + def time_normalize(self): + self.rng2.normalize() - def time_dataframe_resample_min_string(self): - self.df.resample('1s', how='min') + def time_unique(self): + self.index_repeated.unique() + def time_reset_index(self): + self.df.reset_index() -class datetimeindex_add_offset(object): - goal_time = 0.2 + def time_reset_index_tz(self): + self.df2.reset_index() - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=10000, freq='T') + def time_dti_factorize(self): + self.dti.factorize() - def time_datetimeindex_add_offset(self): - (self.rng + timedelta(minutes=2)) + def time_dti_tz_factorize(self): + self.dti_tz.factorize() + def time_timestamp_tzinfo_cons(self): + self.rng5[0] -class datetimeindex_converter(object): - goal_time = 0.2 + def time_infer_dst(self): + self.index.tz_localize('US/Eastern', infer_dst=True) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) + def time_timeseries_is_month_start(self): + self.rng6.is_month_start - def time_datetimeindex_converter(self): - DatetimeConverter.convert(self.rng, None, None) + def time_infer_freq(self): + infer_freq(self.a) -class datetimeindex_infer_dst(object): +class TimeDatetimeConverter(object): goal_time = 0.2 def setup(self): self.N = 100000 
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') - self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') - self.index = self.index.append(self.dst_rng) - self.index = self.index.append(self.dst_rng) - self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S')) - def time_datetimeindex_infer_dst(self): - self.index.tz_localize('US/Eastern', infer_dst=True) + def time_convert(self): + DatetimeConverter.convert(self.rng, None, None) -class datetimeindex_normalize(object): +class Iteration(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') - - def time_datetimeindex_normalize(self): - self.rng.normalize() - - -class datetimeindex_unique(object): - goal_time = 0.2 + self.N = 1000000 + self.M = 10000 + self.idx1 = date_range(start='20140101', freq='T', periods=self.N) + self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1000, freq='T') - self.index = self.rng.repeat(10) + def iter_n(self, iterable, n=None): + self.i = 0 + for _ in iterable: + self.i += 1 + if ((n is not None) and (self.i > n)): + break - def time_datetimeindex_unique(self): - self.index.unique() + def time_iter_datetimeindex(self): + self.iter_n(self.idx1) + def time_iter_datetimeindex_preexit(self): + self.iter_n(self.idx1, self.M) -class dti_reset_index(object): - goal_time = 0.2 + def time_iter_periodindex(self): + self.iter_n(self.idx2) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1000, freq='H') - self.df = DataFrame(np.random.randn(len(self.rng), 2), self.rng) + def time_iter_periodindex_preexit(self): + self.iter_n(self.idx2, self.M) - def time_dti_reset_index(self): - self.df.reset_index() +#---------------------------------------------------------------------- +# Resampling -class dti_reset_index_tz(object): +class ResampleDataFrame(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') - self.df = DataFrame(np.random.randn(len(self.rng), 2), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - def time_dti_reset_index_tz(self): - self.df.reset_index() + def time_max_numpy(self): + self.df.resample('1s', how=np.max) + 
def time_max_string(self): + self.df.resample('1s', how='max') -class datetime_algorithm(object): - goal_time = 0.2 + def time_mean_numpy(self): + self.df.resample('1s', how=np.mean) - def setup(self): - N = 100000 - self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) - self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, - tz='Asia/Tokyo').repeat(5) + def time_mean_string(self): + self.df.resample('1s', how='mean') - def time_dti_factorize(self): - self.dti.factorize() + def time_min_numpy(self): + self.df.resample('1s', how=np.min) - def time_dti_tz_factorize(self): - self.dti_tz.factorize() + def time_min_string(self): + self.df.resample('1s', how='min') -class timeseries_1min_5min_mean(object): +class ResampleSeries(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_1min_5min_mean(self): - self.ts[:10000].resample('5min', how='mean') + self.rng1 = period_range(start='1/1/2000', end='1/1/2001', freq='T') + self.ts1 = Series(np.random.randn(len(self.rng1)), index=self.rng1) + self.rng2 = date_range(start='1/1/2000', end='1/1/2001', freq='T') + self.ts2 = Series(np.random.randn(len(self.rng2)), index=self.rng2) -class timeseries_1min_5min_ohlc(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng3 = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') + self.int_ts = Series(5, self.rng3, dtype='int64') + self.dt_ts = self.int_ts.astype('datetime64[ns]') - def time_timeseries_1min_5min_ohlc(self): - self.ts[:10000].resample('5min', how='ohlc') + def time_period_downsample_mean(self): + self.ts1.resample('D', how='mean') + def time_timestamp_downsample_mean(self): + self.ts2.resample('D', how='mean') -class timeseries_add_irregular(object): - goal_time = 0.2 + def time_resample_datetime64(self): + # GH 7754 + self.dt_ts.resample('1S', how='last') - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.lindex = np.random.permutation(self.N)[:(self.N // 2)] - self.rindex = np.random.permutation(self.N)[:(self.N // 2)] - self.left = Series(self.ts.values.take(self.lindex), index=self.ts.index.take(self.lindex)) - self.right = Series(self.ts.values.take(self.rindex), index=self.ts.index.take(self.rindex)) + def time_1min_5min_mean(self): + self.ts2[:10000].resample('5min', how='mean') - def time_timeseries_add_irregular(self): - (self.left + self.right) + def time_1min_5min_ohlc(self): + self.ts2[:10000].resample('5min', how='ohlc') -class timeseries_asof(object): +class AsOf(object): goal_time = 0.2 def setup(self): self.N = 10000 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') self.ts = Series(np.random.randn(self.N), index=self.rng) + self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') self.ts2 = self.ts.copy() self.ts2[250:5000] = np.nan self.ts3 = self.ts.copy() self.ts3[-5000:] = np.nan # 
test speed of pre-computing NAs. - def time_asof_list(self): + def time_asof(self): self.ts.asof(self.dates) # should be roughly the same as above. - def time_asof_nan_list(self): + def time_asof_nan(self): self.ts2.asof(self.dates) # test speed of the code path for a scalar index @@ -318,7 +234,7 @@ def time_asof_nan_single(self): self.ts3.asof(self.dates[-1]) -class timeseries_dataframe_asof(object): +class AsOfDataFrame(object): goal_time = 0.2 def setup(self): @@ -333,11 +249,11 @@ def setup(self): self.ts3.iloc[-5000:] = np.nan # test speed of pre-computing NAs. - def time_asof_list(self): + def time_asof(self): self.ts.asof(self.dates) # should be roughly the same as above. - def time_asof_nan_list(self): + def time_asof_nan(self): self.ts2.asof(self.dates) # test speed of the code path for a scalar index @@ -356,107 +272,105 @@ def time_asof_single_early(self): self.ts.asof(self.dates[0] - dt.timedelta(10)) -class timeseries_custom_bday_apply(object): +class TimeSeries(object): goal_time = 0.2 def setup(self): self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert + self.rng = date_range(start='1/1/2000', periods=self.N, freq='s') + self.rng = self.rng.take(np.random.permutation(self.N)) self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - def time_timeseries_custom_bday_apply(self): - self.cday.apply(self.date) + self.rng2 = date_range(start='1/1/2000', periods=self.N, freq='T') + self.ts2 = Series(np.random.randn(self.N), index=self.rng2) + self.lindex = np.random.permutation(self.N)[:(self.N // 2)] + self.rindex = np.random.permutation(self.N)[:(self.N // 2)] + self.left = Series(self.ts2.values.take(self.lindex), index=self.ts2.index.take(self.lindex)) + self.right = Series(self.ts2.values.take(self.rindex), index=self.ts2.index.take(self.rindex)) -class timeseries_custom_bday_apply_dt64(object): - goal_time = 0.2 + self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S') + self.ts3 = Series(1, index=self.rng3) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + def time_sort_index(self): + self.ts.sort_index() - def time_timeseries_custom_bday_apply_dt64(self): - self.cday.apply(self.dt64) + def time_timeseries_slice_minutely(self): + self.ts2[:10000] + def time_add_irregular(self): + (self.left + self.right) + + def time_large_lookup_value(self): + 
self.ts3[self.ts3.index[(len(self.ts3) // 2)]] + self.ts3.index._cleanup() -class timeseries_custom_bday_cal_decr(object): + +class SeriesArithmetic(object): goal_time = 0.2 def setup(self): self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) + self.delta_offset = pd.offsets.Day() + self.fast_offset = pd.offsets.DateOffset(months=2, days=2) + self.slow_offset = pd.offsets.BusinessDay() + + def time_add_offset_delta(self): + (self.s + self.delta_offset) - def time_timeseries_custom_bday_cal_decr(self): - (self.date - (1 * self.cdayh)) + def time_add_offset_fast(self): + (self.s + self.fast_offset) + + def time_add_offset_slow(self): + (self.s + self.slow_offset) -class timeseries_custom_bday_cal_incr(object): +class ToDatetime(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + self.rng = date_range(start='1/1/2000', periods=10000, freq='D') + self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) + + self.rng = date_range(start='1/1/2000', periods=20000, freq='H') + self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] + self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] + self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' + for x in self.rng] + + self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) + self.s2 = self.s.str.replace(':\\S+$', '') - def time_timeseries_custom_bday_cal_incr(self): - (self.date + (1 * self.cdayh)) + def time_format_YYYYMMDD(self): + to_datetime(self.stringsD, format='%Y%m%d') + def time_iso8601(self): + to_datetime(self.strings) + + def time_iso8601_nosep(self): + to_datetime(self.strings_nosep) + + def time_iso8601_format(self): + to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') + + def time_iso8601_format_no_sep(self): + to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') + + def time_iso8601_tz_spaceformat(self): + to_datetime(self.strings_tz_space) + + def time_format_exact(self): + to_datetime(self.s2, format='%d%b%y') + + def time_format_no_exact(self): + to_datetime(self.s, format='%d%b%y', exact=False) -class timeseries_custom_bday_cal_incr_n(object): + +class Offsets(object): goal_time = 0.2 
def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) self.date = dt.datetime(2011, 1, 1) self.dt64 = np.datetime64('2011-01-01 09:00Z') self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() @@ -467,718 +381,63 @@ def setup(self): self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - def time_timeseries_custom_bday_cal_incr_n(self): - (self.date + (10 * self.cdayh)) - - -class timeseries_custom_bday_cal_incr_neg_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + def time_timeseries_day_apply(self): + self.day.apply(self.date) - def time_timeseries_custom_bday_cal_incr_neg_n(self): - (self.date - (10 * self.cdayh)) + def time_timeseries_day_incr(self): + (self.date + self.day) + def time_timeseries_year_apply(self): + self.year.apply(self.date) -class timeseries_custom_bday_decr(object): - goal_time = 0.2 + def time_timeseries_year_incr(self): + (self.date + self.year) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + # custom business offsets - def time_timeseries_custom_bday_decr(self): + def time_custom_bday_decr(self): (self.date - self.cday) - -class timeseries_custom_bday_incr(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_incr(self): + def time_custom_bday_incr(self): (self.date + self.cday) + def time_custom_bday_apply(self): + 
self.cday.apply(self.date) -class timeseries_custom_bmonthbegin_decr_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthbegin_decr_n(self): - (self.date - (10 * self.cmb)) - - -class timeseries_custom_bmonthbegin_incr_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthbegin_incr_n(self): - (self.date + (10 * self.cmb)) - - -class timeseries_custom_bmonthend_decr_n(object): - goal_time = 0.2 + def time_custom_bday_apply_dt64(self): + self.cday.apply(self.dt64) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + def time_custom_bday_cal_incr(self): + self.date + 1 * self.cdayh - def time_timeseries_custom_bmonthend_decr_n(self): - (self.date - (10 * self.cme)) + def time_custom_bday_cal_decr(self): + self.date - 1 * self.cdayh + def time_custom_bday_cal_incr_n(self): + self.date + 10 * self.cdayh -class timeseries_custom_bmonthend_incr(object): - goal_time = 0.2 + def time_custom_bday_cal_incr_neg_n(self): + self.date - 10 * self.cdayh - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = 
pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + # Increment custom business month - def time_timeseries_custom_bmonthend_incr(self): + def time_custom_bmonthend_incr(self): (self.date + self.cme) - -class timeseries_custom_bmonthend_incr_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthend_incr_n(self): + def time_custom_bmonthend_incr_n(self): (self.date + (10 * self.cme)) + def time_custom_bmonthend_decr_n(self): + (self.date - (10 * self.cme)) -class timeseries_datetimeindex_offset_delta(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_delta(self): - (self.idx1 + self.delta_offset) - - -class timeseries_datetimeindex_offset_fast(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_fast(self): - (self.idx1 + self.fast_offset) - - -class timeseries_datetimeindex_offset_slow(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_slow(self): - (self.idx1 + self.slow_offset) - - -class timeseries_day_apply(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = 
np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_day_apply(self): - self.day.apply(self.date) - - -class timeseries_day_incr(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_day_incr(self): - (self.date + self.day) - - -class timeseries_infer_freq(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/1700', freq='D', periods=100000) - self.a = self.rng[:50000].append(self.rng[50002:]) - - def time_timeseries_infer_freq(self): - infer_freq(self.a) - - -class timeseries_is_month_start(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1', periods=self.N, freq='B') - - def time_timeseries_is_month_start(self): - self.rng.is_month_start - - -class timeseries_iter_datetimeindex(object): - goal_time = 0.2 + def time_custom_bmonthbegin_decr_n(self): + (self.date - (10 * self.cmb)) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_datetimeindex(self): - self.iter_n(self.idx1) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_iter_datetimeindex_preexit(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_datetimeindex_preexit(self): - 
self.iter_n(self.idx1, self.M) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_iter_periodindex(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_periodindex(self): - self.iter_n(self.idx2) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_iter_periodindex_preexit(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_periodindex_preexit(self): - self.iter_n(self.idx2, self.M) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_large_lookup_value(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1500000, freq='S') - self.ts = Series(1, index=self.rng) - - def time_timeseries_large_lookup_value(self): - self.ts[self.ts.index[(len(self.ts) // 2)]] - self.ts.index._cleanup() - - -class timeseries_period_downsample_mean(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = period_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_timeseries_period_downsample_mean(self): - self.ts.resample('D', how='mean') - - -class timeseries_resample_datetime64(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') - self.int_ts = Series(5, self.rng, dtype='int64') - self.ts = self.int_ts.astype('datetime64[ns]') - - def time_timeseries_resample_datetime64(self): - self.ts.resample('1S', how='last') - - -class timeseries_series_offset_delta(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = 
Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_delta(self): - (self.s + self.delta_offset) - - -class timeseries_series_offset_fast(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_fast(self): - (self.s + self.fast_offset) - - -class timeseries_series_offset_slow(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_slow(self): - (self.s + self.slow_offset) - - -class timeseries_slice_minutely(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_slice_minutely(self): - self.ts[:10000] - - -class timeseries_sort_index(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='s') - self.rng = self.rng.take(np.random.permutation(self.N)) - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_sort_index(self): - self.ts.sort_index() - - -class timeseries_timestamp_downsample_mean(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_timeseries_timestamp_downsample_mean(self): - self.ts.resample('D', how='mean') - - -class timeseries_timestamp_tzinfo_cons(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') - - def time_timeseries_timestamp_tzinfo_cons(self): - self.rng[0] - - -class timeseries_to_datetime_YYYYMMDD(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 
- self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.strings = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) - - def time_timeseries_to_datetime_YYYYMMDD(self): - to_datetime(self.strings, format='%Y%m%d') - - -class timeseries_to_datetime_iso8601(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] - self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] - self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' - for x in self.rng] - - def time_timeseries_to_datetime_iso8601(self): - to_datetime(self.strings) - - def time_timeseries_to_datetime_iso8601_nosep(self): - to_datetime(self.strings_nosep) - - def time_timeseries_to_datetime_iso8601_format(self): - to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') - - def time_timeseries_to_datetime_iso8601_format_no_sep(self): - to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') - - def time_timeseries_to_datetime_iso8601_tz_spaceformat(self): - to_datetime(self.strings_tz_space) - - -class timeseries_with_format_no_exact(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) - - def time_timeseries_with_format_no_exact(self): - to_datetime(self.s, format='%d%b%y', exact=False) - - -class timeseries_with_format_replace(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) - - def time_timeseries_with_format_replace(self): - to_datetime(self.s.str.replace(':\\S+$', ''), format='%d%b%y') - - -class timeseries_year_apply(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_year_apply(self): - self.year.apply(self.date) - - -class timeseries_year_incr(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = 
pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_year_incr(self): - (self.date + self.year) + def time_custom_bmonthbegin_incr_n(self): + (self.date + (10 * self.cmb)) -class timeseries_semi_month_offset(object): +class SemiMonthOffset(object): goal_time = 0.2 def setup(self): @@ -1189,50 +448,50 @@ def setup(self): self.semi_month_end = pd.offsets.SemiMonthEnd() self.semi_month_begin = pd.offsets.SemiMonthBegin() - def time_semi_month_end_apply(self): + def time_end_apply(self): self.semi_month_end.apply(self.date) - def time_semi_month_end_incr(self): + def time_end_incr(self): self.date + self.semi_month_end - def time_semi_month_end_incr_n(self): + def time_end_incr_n(self): self.date + 10 * self.semi_month_end - def time_semi_month_end_decr(self): + def time_end_decr(self): self.date - self.semi_month_end - def time_semi_month_end_decr_n(self): + def time_end_decr_n(self): self.date - 10 * self.semi_month_end - def time_semi_month_end_apply_index(self): + def time_end_apply_index(self): self.semi_month_end.apply_index(self.rng) - def time_semi_month_end_incr_rng(self): + def time_end_incr_rng(self): self.rng + self.semi_month_end - def time_semi_month_end_decr_rng(self): + def time_end_decr_rng(self): self.rng - self.semi_month_end - def time_semi_month_begin_apply(self): + def time_begin_apply(self): self.semi_month_begin.apply(self.date) - def time_semi_month_begin_incr(self): + def time_begin_incr(self): self.date + self.semi_month_begin - def time_semi_month_begin_incr_n(self): + def time_begin_incr_n(self): self.date + 10 * self.semi_month_begin - def time_semi_month_begin_decr(self): + def time_begin_decr(self): self.date - self.semi_month_begin - def time_semi_month_begin_decr_n(self): + def time_begin_decr_n(self): self.date - 10 * self.semi_month_begin - def time_semi_month_begin_apply_index(self): + def time_begin_apply_index(self): self.semi_month_begin.apply_index(self.rng) - def time_semi_month_begin_incr_rng(self): + def time_begin_incr_rng(self): self.rng + self.semi_month_begin - def time_semi_month_begin_decr_rng(self): + def time_begin_decr_rng(self): self.rng - self.semi_month_begin From 000e8d4f2fbfbf1f492381f014ea16d37d2cb92e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 10 Dec 2016 10:12:48 -0500 Subject: [PATCH 147/183] BLD: try new gh token for pandas-docs --- .travis.yml | 2 +- ci/build_docs.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7de67476f5ec4..4be2bf2a105ff 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ env: global: # pandas-docs-travis GH - - secure: "UJK7kUtkcnV9PFP4IBXAvgmRQKdwARlfqF4UZQ5tBwrpnD1a3n7FLBijcuXQ3jkvwpEc/FZB9RJDXmsqYXJPvq3BC++2Cv2tFDvKr/c+y8KffszAyVk47jKEHMNmGgauwaNMggsE/rH8YHe4so9LsJHTRbzmLo8lXPNTldoIu5s=" + - secure: Oz6cwVu3NetKZ5nKLJ4RQQBbRJE4KF3J5fkVwICJ2SQaE00ng8os4zJRGSkf0g+K1AVJpQ9A1XKG/IOKMBSkGiXBaVR/Qk/5b+QOTjEhgQBd7tdYjBrFfzpn0AIWn+70nAh24pvuGmq5MU9ILUXwNVVM87FF7cJ7efNseveh7Ss= git: # for cloning diff --git a/ci/build_docs.sh b/ci/build_docs.sh index d55dce1344a64..79f004319c388 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -55,7 +55,8 @@ if [ x"$DOC_BUILD" != x"" ]; then touch .nojekyll git add --all . 
git commit -m "Version" --allow-empty - git remote add origin "https://pandas-docs:$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis" + git remote remove origin + git remote add origin "https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis.git" git push origin gh-pages -f fi From 34807fc25e7a9dd9a04b0e438f1704c13fc5cae5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 10 Dec 2016 10:31:53 -0500 Subject: [PATCH 148/183] TST: skip testing on windows for specific formatting which sometimes hangs (#14851) xref #14626 --- pandas/tests/indexes/test_base.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 628095a2fcbd3..2dfeb7da07a3d 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -915,23 +915,12 @@ def test_format(self): self._check_method_works(Index.format) # GH 14626 - # our formatting is different by definition when we have - # ms vs us precision (e.g. trailing zeros); - # so don't compare this case - def datetime_now_without_trailing_zeros(): - now = datetime.now() - - while str(now).endswith("000"): - now = datetime.now() - - return now - - index = Index([datetime_now_without_trailing_zeros()]) - # windows has different precision on datetime.datetime.now (it doesn't # include us since the default for Timestamp shows these but Index - # formating does not we are skipping - if not is_platform_windows(): + # formating does not we are skipping) + now = datetime.now() + if not str(now).endswith("000"): + index = Index([now]) formatted = index.format() expected = [str(index[0])] self.assertEqual(formatted, expected) From d531718749ed686a975cae92a13e9ab9bd5aac6d Mon Sep 17 00:00:00 2001 From: Pawel Kordek Date: Sat, 10 Dec 2016 10:36:51 -0500 Subject: [PATCH 149/183] BUG: GH11847 Unstack with mixed dtypes coerces everything to object closes #11847 Changed the way in which the original data frame is copied (dropped use of .values, since it does not preserve dtypes). 
Author: Pawel Kordek Closes #14053 from kordek/#11847 and squashes the following commits: 6a381ce [Pawel Kordek] BUG: GH11847 Unstack with mixed dtypes coerces everything to object --- doc/source/whatsnew/v0.19.2.txt | 2 ++ pandas/core/reshape.py | 6 +++-- pandas/tests/frame/test_reshape.py | 40 ++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 0567a3c3fa2bb..b4674345dcb96 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -76,3 +76,5 @@ Bug Fixes - Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) + +- Bug in ``unstack()`` if called with a list of column(s) as an argument, regardless of the dtypes of all columns, they get coerced to ``object`` (:issue:`11847`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 055a0041b181a..89317d51f722d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -277,7 +277,8 @@ def _unstack_multiple(data, clocs): verify_integrity=False) if isinstance(data, Series): - dummy = Series(data.values, index=dummy_index) + dummy = data.copy() + dummy.index = dummy_index unstacked = dummy.unstack('__placeholder__') new_levels = clevels new_names = cnames @@ -292,7 +293,8 @@ def _unstack_multiple(data, clocs): return result - dummy = DataFrame(data.values, index=dummy_index, columns=data.columns) + dummy = data.copy() + dummy.index = dummy_index unstacked = dummy.unstack('__placeholder__') if isinstance(unstacked, Series): diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 8b1b1130dc2fc..6b0dd38cdb82c 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -282,6 +282,46 @@ def test_unstack_fill_frame_categorical(self): index=list('xyz')) assert_frame_equal(result, expected) + def test_unstack_preserve_dtypes(self): + # Checks fix for #11847 + df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'], + index=['a', 'b', 'c'], + some_categories=pd.Series(['a', 'b', 'c'] + ).astype('category'), + A=np.random.rand(3), + B=1, + C='foo', + D=pd.Timestamp('20010102'), + E=pd.Series([1.0, 50.0, 100.0] + ).astype('float32'), + F=pd.Series([3.0, 4.0, 5.0]).astype('float64'), + G=False, + H=pd.Series([1, 200, 923442], dtype='int8'))) + + def unstack_and_compare(df, column_name): + unstacked1 = df.unstack([column_name]) + unstacked2 = df.unstack(column_name) + assert_frame_equal(unstacked1, unstacked2) + + df1 = df.set_index(['state', 'index']) + unstack_and_compare(df1, 'index') + + df1 = df.set_index(['state', 'some_categories']) + unstack_and_compare(df1, 'some_categories') + + df1 = df.set_index(['F', 'C']) + unstack_and_compare(df1, 'F') + + df1 = df.set_index(['G', 'B', 'state']) + unstack_and_compare(df1, 'B') + + df1 = df.set_index(['E', 'A']) + unstack_and_compare(df1, 'E') + + df1 = df.set_index(['state', 'index']) + s = df1['A'] + unstack_and_compare(s, 'index') + def test_stack_ints(self): columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) From e991141f3c13117035fb2a08f4e64b4c18c9d5a9 Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Sat, 10 Dec 2016 10:40:17 -0500 Subject: [PATCH 150/183] BUG: Allow TZ-aware DatetimeIndex in merge_asof() (#14844) closes #14844 Author: Christopher C. Aycock Closes #14845 from chrisaycock/GH14844 and squashes the following commits: 97b73a8 [Christopher C. 
Aycock] BUG: Allow TZ-aware DatetimeIndex in merge_asof() (#14844) --- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/tools/merge.py | 4 ++-- pandas/tools/tests/test_merge_asof.py | 24 ++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index b4674345dcb96..a9897f389fe12 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -74,6 +74,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`) +- Bug in ``pd.merge_asof()`` could not handle timezone-aware DatetimeIndex when a tolerance was specified (:issue:`14844`) - Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 8d2f92ad58a88..68953c90676dd 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1021,7 +1021,7 @@ def _get_merge_keys(self): msg = "incompatible tolerance, must be compat " \ "with type {0}".format(type(lt)) - if is_datetime64_dtype(lt): + if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, Timedelta): raise MergeError(msg) if self.tolerance < Timedelta(0): @@ -1034,7 +1034,7 @@ def _get_merge_keys(self): raise MergeError("tolerance must be positive") else: - raise MergeError(msg) + raise MergeError("key must be integer or timestamp") # validate allow_exact_matches if not is_bool(self.allow_exact_matches): diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index f413618624592..5c8f424bde7a5 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -1,6 +1,7 @@ import nose import os +import pytz import numpy as np import pandas as pd from pandas import (merge_asof, read_csv, @@ -293,6 +294,29 @@ def test_tolerance(self): expected = self.tolerance assert_frame_equal(result, expected) + def test_tolerance_tz(self): + # GH 14844 + left = pd.DataFrame( + {'date': pd.DatetimeIndex(start=pd.to_datetime('2016-01-02'), + freq='D', periods=5, + tz=pytz.timezone('UTC')), + 'value1': np.arange(5)}) + right = pd.DataFrame( + {'date': pd.DatetimeIndex(start=pd.to_datetime('2016-01-01'), + freq='D', periods=5, + tz=pytz.timezone('UTC')), + 'value2': list("ABCDE")}) + result = pd.merge_asof(left, right, on='date', + tolerance=pd.Timedelta('1 day')) + + expected = pd.DataFrame( + {'date': pd.DatetimeIndex(start=pd.to_datetime('2016-01-02'), + freq='D', periods=5, + tz=pytz.timezone('UTC')), + 'value1': np.arange(5), + 'value2': list("BCDEE")}) + assert_frame_equal(result, expected) + def test_allow_exact_matches(self): result = merge_asof(self.trades, self.quotes, From cb2d6eb53ebe14f10a713a419a39faa545446a3e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 10 Dec 2016 13:00:42 -0500 Subject: [PATCH 151/183] CLN: lint of test_base.py --- pandas/tests/indexes/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2dfeb7da07a3d..9be4935716989 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -6,7 +6,7 @@ from pandas.indexes.api import Index, MultiIndex from .common import Base -from pandas.compat import (is_platform_windows, range, lrange, lzip, u, +from pandas.compat import (range, lrange, lzip, u, zip, PY3, 
PY36) import operator import os From bca7be90d5513c68745790b1a1ffa1e7a892cb1a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 10 Dec 2016 13:31:11 -0800 Subject: [PATCH 152/183] Frame benchmarking sum instead of mean (#14824) --- asv_bench/benchmarks/frame_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 3daffb9d3a1cc..8cbf5b8d97b70 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -299,7 +299,7 @@ def time_apply_axis_1(self): self.df.apply((lambda x: (x + 1)), axis=1) def time_apply_lambda_mean(self): - self.df.apply((lambda x: x.sum())) + self.df.apply((lambda x: x.mean())) def time_apply_np_mean(self): self.df.apply(np.mean) From b6de920d8c3c2becc46b4fe233e9f388947554f2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 10 Dec 2016 13:35:41 -0800 Subject: [PATCH 153/183] TST: Correct results with np.size and crosstab (#4003) (#14755) missing assert --- pandas/tools/tests/test_pivot.py | 35 ++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 26f80f463d609..5e800c02c9509 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -1281,6 +1281,41 @@ def test_crosstab_with_categorial_columns(self): columns=expected_columns) tm.assert_frame_equal(result, expected) + def test_crosstab_with_numpy_size(self): + # GH 4003 + df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, + 'B': ['A', 'B', 'C'] * 8, + 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, + 'D': np.random.randn(24), + 'E': np.random.randn(24)}) + result = pd.crosstab(index=[df['A'], df['B']], + columns=[df['C']], + margins=True, + aggfunc=np.size, + values=df['D']) + expected_index = pd.MultiIndex(levels=[['All', 'one', 'three', 'two'], + ['', 'A', 'B', 'C']], + labels=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], + [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + names=['A', 'B']) + expected_column = pd.Index(['bar', 'foo', 'All'], + dtype='object', + name='C') + expected_data = np.array([[2., 2., 4.], + [2., 2., 4.], + [2., 2., 4.], + [2., np.nan, 2.], + [np.nan, 2., 2.], + [2., np.nan, 2.], + [np.nan, 2., 2.], + [2., np.nan, 2.], + [np.nan, 2., 2.], + [12., 12., 24.]]) + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_column) + tm.assert_frame_equal(result, expected) + if __name__ == '__main__': import nose From 81a2f792a342a047bcc98b45cd6e99e6d82010b3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 10 Dec 2016 17:41:22 -0500 Subject: [PATCH 154/183] BLD: escape GH_TOKEN in build_docs --- ci/build_docs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 79f004319c388..6dcd291b5a72f 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -56,7 +56,8 @@ if [ x"$DOC_BUILD" != x"" ]; then git add --all . 
git commit -m "Version" --allow-empty git remote remove origin - git remote add origin "https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis.git" + echo "https://${GH_TOKEN}@github.com/pandas-docs/pandas-docs-travis.git" + git remote add origin "https://${GH_TOKEN}@github.com/pandas-docs/pandas-docs-travis.git" git push origin gh-pages -f fi From 1b0fecd4b8db4efc022aaa2ecdc8cca5900a1e9d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 11 Dec 2016 06:06:54 -0800 Subject: [PATCH 155/183] TST: Test DatetimeIndex weekend offset (#14853) --- pandas/tests/indexes/test_datetimelike.py | 35 ++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 68db163be6fde..0017271fe6c97 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from datetime import datetime, timedelta, time +from datetime import datetime, timedelta, time, date import numpy as np @@ -348,6 +348,19 @@ def test_construction_outofbounds(self): # can't create DatetimeIndex DatetimeIndex(dates) + def test_construction_with_ndarray(self): + # GH 5152 + dates = [datetime(2013, 10, 7), + datetime(2013, 10, 8), + datetime(2013, 10, 9)] + data = DatetimeIndex(dates, freq=pd.tseries.frequencies.BDay()).values + result = DatetimeIndex(data, freq=pd.tseries.frequencies.BDay()) + expected = DatetimeIndex(['2013-10-07', + '2013-10-08', + '2013-10-09'], + freq='B') + tm.assert_index_equal(result, expected) + def test_astype(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) @@ -748,6 +761,26 @@ def test_difference_freq(self): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal('freq', idx_diff, expected) + def test_week_of_month_frequency(self): + # GH 5348: "ValueError: Could not evaluate WOM-1SUN" shouldn't raise + d1 = date(2002, 9, 1) + d2 = date(2013, 10, 27) + d3 = date(2012, 9, 30) + idx1 = DatetimeIndex([d1, d2]) + idx2 = DatetimeIndex([d3]) + result_append = idx1.append(idx2) + expected = DatetimeIndex([d1, d2, d3]) + tm.assert_index_equal(result_append, expected) + result_union = idx1.union(idx2) + expected = DatetimeIndex([d1, d3, d2]) + tm.assert_index_equal(result_union, expected) + + # GH 5115 + result = date_range("2013-1-1", periods=4, freq='WOM-1SAT') + dates = ['2013-01-05', '2013-02-02', '2013-03-02', '2013-04-06'] + expected = DatetimeIndex(dates, freq='WOM-1SAT') + tm.assert_index_equal(result, expected) + class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex From e1f9b966eb642c073b610df87a248b3a6d0e5588 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 11 Dec 2016 12:09:39 -0500 Subject: [PATCH 156/183] BLD: new access token on pandas-dev --- .travis.yml | 2 +- ci/build_docs.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4be2bf2a105ff..1ac59a1219326 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ env: global: # pandas-docs-travis GH - - secure: Oz6cwVu3NetKZ5nKLJ4RQQBbRJE4KF3J5fkVwICJ2SQaE00ng8os4zJRGSkf0g+K1AVJpQ9A1XKG/IOKMBSkGiXBaVR/Qk/5b+QOTjEhgQBd7tdYjBrFfzpn0AIWn+70nAh24pvuGmq5MU9ILUXwNVVM87FF7cJ7efNseveh7Ss= + secure: "YvvTc+FrSYHgdxqoxn9s8VOaCWjvZzlkaf6k55kkmQqCYR9dPiLMsot1F96/N7o3YlD1s0znPQCak93Du8HHi/8809zAXloTaMSZrWz4R4qn96xlZFRE88O/w/Z1t3VVYpKX3MHlCggBc8MtXrqmvWKJMAqXyysZ4TTzoiJDPvE=" git: # for 
cloning diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 6dcd291b5a72f..4dc9a203f1978 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -56,8 +56,7 @@ if [ x"$DOC_BUILD" != x"" ]; then git add --all . git commit -m "Version" --allow-empty git remote remove origin - echo "https://${GH_TOKEN}@github.com/pandas-docs/pandas-docs-travis.git" - git remote add origin "https://${GH_TOKEN}@github.com/pandas-docs/pandas-docs-travis.git" + git remote add origin "https://${PANDAS_GH_TOKEN}@github.com/pandas-docs/pandas-docs-travis.git" git push origin gh-pages -f fi From 428c106a051e6adfbac94a39022e8e7d89ef4bdb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 11 Dec 2016 14:57:42 -0500 Subject: [PATCH 157/183] BLD: missing - on secure --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1ac59a1219326..be167451f3460 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ env: global: # pandas-docs-travis GH - secure: "YvvTc+FrSYHgdxqoxn9s8VOaCWjvZzlkaf6k55kkmQqCYR9dPiLMsot1F96/N7o3YlD1s0znPQCak93Du8HHi/8809zAXloTaMSZrWz4R4qn96xlZFRE88O/w/Z1t3VVYpKX3MHlCggBc8MtXrqmvWKJMAqXyysZ4TTzoiJDPvE=" + - secure: "YvvTc+FrSYHgdxqoxn9s8VOaCWjvZzlkaf6k55kkmQqCYR9dPiLMsot1F96/N7o3YlD1s0znPQCak93Du8HHi/8809zAXloTaMSZrWz4R4qn96xlZFRE88O/w/Z1t3VVYpKX3MHlCggBc8MtXrqmvWKJMAqXyysZ4TTzoiJDPvE=" git: # for cloning From 602cc4682a9f70e3e2f8b4261a68ad36bdd4a249 Mon Sep 17 00:00:00 2001 From: xgdgsc Date: Mon, 12 Dec 2016 04:26:35 +0800 Subject: [PATCH 158/183] DOC: warning section on memory overflow when joining/merging dataframes on index with duplicate keys (#14788) closes #14736 --- doc/source/merging.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index c6541a26c72b4..f95987afd4c77 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -692,6 +692,29 @@ either the left or right tables, the values in the joined table will be p.plot([left, right], result, labels=['left', 'right'], vertical=False); plt.close('all'); + +Here is another example with duplicate join keys in DataFrames: + +.. ipython:: python + + left = pd.DataFrame({'A' : [1,2], 'B' : [2, 2]}) + + right = pd.DataFrame({'A' : [4,5,6], 'B': [2,2,2]}) + + result = pd.merge(left, right, on='B', how='outer') + +.. ipython:: python + :suppress: + + @savefig merging_merge_on_key_dup.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. warning:: + + Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, + may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames. .. 
_merging.indicator: From e833096244d71c7253cf763556f51f0bece1d6f4 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sun, 11 Dec 2016 16:53:54 -0500 Subject: [PATCH 159/183] BUG: Apply min_itemsize to index even when not appending closes #10381 Author: Pietro Battiston Closes #14812 from toobaz/to_hdf_min_itemsize and squashes the following commits: c07f1e4 [Pietro Battiston] Whatsnew 38b8fcc [Pietro Battiston] Tests for previous commit c838afa [Pietro Battiston] BUG: set min_itemsize even when there is no need to validate (#10381) --- doc/source/whatsnew/v0.19.2.txt | 1 + pandas/io/pytables.py | 5 ++--- pandas/io/tests/test_pytables.py | 36 ++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index a9897f389fe12..231297df3fb8f 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -60,6 +60,7 @@ Bug Fixes - Bug in ``HDFStore`` when writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`) - Bug in ``HDFStore.append()`` when writing a ``Series`` and passing a ``min_itemsize`` argument containing a value for the ``index`` (:issue:`11412`) +- Bug when writing to a ``HDFStore`` in ``table`` format with a ``min_itemsize`` value for the ``index`` and without asking to append (:issue:`10381`) - Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`) - Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 693606fdd1d32..e474aeab1f6ca 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3429,9 +3429,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, j = len(self.index_axes) # check for column conflicts - if validate: - for a in self.axes: - a.maybe_set_size(min_itemsize=min_itemsize) + for a in self.axes: + a.maybe_set_size(min_itemsize=min_itemsize) # reindex by our non_index_axes & compute data_columns for a in self.non_index_axes: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d621797558c8f..b23d0b89fe850 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1372,6 +1372,22 @@ def check_col(key, name, size): min_itemsize={'index': 4}) tm.assert_series_equal(store.select('ss2'), df['B']) + # min_itemsize in index without appending (GH 10381) + store.put('ss3', df, format='table', + min_itemsize={'index': 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C='longer').set_index('C') + store.append('ss3', df2) + tm.assert_frame_equal(store.select('ss3'), + pd.concat([df, df2])) + + # same as above, with a Series + store.put('ss4', df['B'], format='table', + min_itemsize={'index': 6}) + store.append('ss4', df2['B']) + tm.assert_series_equal(store.select('ss4'), + pd.concat([df['B'], df2['B']])) + # with nans _maybe_remove(store, 'df') df = tm.makeTimeDataFrame() @@ -1426,6 +1442,26 @@ def check_col(key, name, size): self.assertRaises(ValueError, store.append, 'df', df, min_itemsize={'foo': 20, 'foobar': 20}) + def test_to_hdf_with_min_itemsize(self): + + with ensure_clean_path(self.path) as path: + + # min_itemsize in index with to_hdf (GH 10381) + df = tm.makeMixedDataFrame().set_index('C') + df.to_hdf(path, 'ss3', format='table', min_itemsize={'index': 6}) + # just make sure there is a longer string: + df2 = 
df.copy().reset_index().assign(C='longer').set_index('C') + df2.to_hdf(path, 'ss3', append=True, format='table') + tm.assert_frame_equal(pd.read_hdf(path, 'ss3'), + pd.concat([df, df2])) + + # same as above, with a Series + df['B'].to_hdf(path, 'ss4', format='table', + min_itemsize={'index': 6}) + df2['B'].to_hdf(path, 'ss4', append=True, format='table') + tm.assert_series_equal(pd.read_hdf(path, 'ss4'), + pd.concat([df['B'], df2['B']])) + def test_append_with_data_columns(self): with ensure_clean_store(self.path) as store: From 0c82abef8bc8fc7419a3b8823229cafd195595ac Mon Sep 17 00:00:00 2001 From: Shawn Heide Date: Sun, 11 Dec 2016 14:23:49 -0800 Subject: [PATCH 160/183] BUG: astype falsely converts inf to integer (GH14265) (#14343) --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/sparse/tests/test_array.py | 2 +- pandas/tests/frame/test_dtypes.py | 14 +++++++++++--- pandas/tests/series/test_dtypes.py | 14 +++++++++++--- pandas/types/cast.py | 6 ++++-- 5 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f534c67273560..8fdef39a3ae98 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -118,3 +118,5 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + +- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) \ No newline at end of file diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 1c9b6119cf665..f210f70ad1940 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -361,7 +361,7 @@ def test_astype(self): arr.astype('i8') arr = SparseArray([0, np.nan, 0, 1], fill_value=0) - msg = "Cannot convert NA to integer" + msg = 'Cannot convert non-finite values \(NA or inf\) to integer' with tm.assertRaisesRegexp(ValueError, msg): arr.astype('i8') diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 817770b9da610..61030c262a44b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -353,9 +353,17 @@ def test_astype_with_view(self): tf = self.frame.astype(np.float64) casted = tf.astype(np.int64, copy=False) # noqa - def test_astype_cast_nan_int(self): - df = DataFrame(data={"Values": [1.0, 2.0, 3.0, np.nan]}) - self.assertRaises(ValueError, df.astype, np.int64) + def test_astype_cast_nan_inf_int(self): + # GH14265, check nan and inf raise error when converting to int + types = [np.int32, np.int64] + values = [np.nan, np.inf] + msg = 'Cannot convert non-finite values \(NA or inf\) to integer' + + for this_type in types: + for this_val in values: + df = DataFrame([this_val]) + with tm.assertRaisesRegexp(ValueError, msg): + df.astype(this_type) def test_astype_str(self): # GH9757 diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 9a406dfa10c35..3eafbaf912797 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -42,9 +42,17 @@ def test_dtype(self): assert_series_equal(self.ts.get_ftype_counts(), Series( 1, ['float64:dense'])) - def test_astype_cast_nan_int(self): - df = Series([1.0, 2.0, 3.0, np.nan]) - self.assertRaises(ValueError, df.astype, np.int64) + def test_astype_cast_nan_inf_int(self): + # GH14265, check nan and inf raise error when converting to int + types = [np.int32, np.int64] + values = [np.nan, np.inf] + msg = 'Cannot convert non-finite values \(NA or 
inf\) to integer' + + for this_type in types: + for this_val in values: + s = Series([this_val]) + with self.assertRaisesRegexp(ValueError, msg): + s.astype(this_type) def test_astype_cast_object_int(self): arr = Series(["car", "house", "tree", "1"]) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index a79862eb195b6..d4beab5655e5c 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -527,8 +527,10 @@ def _astype_nansafe(arr, dtype, copy=True): elif (np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer)): - if np.isnan(arr).any(): - raise ValueError('Cannot convert NA to integer') + if not np.isfinite(arr).all(): + raise ValueError('Cannot convert non-finite values (NA or inf) to ' + 'integer') + elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): # work around NumPy brokenness, #1987 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) From dfe82304a14c48c5139eb899713589338f3f0d0d Mon Sep 17 00:00:00 2001 From: Harshit Patni Date: Mon, 12 Dec 2016 03:10:35 -0800 Subject: [PATCH 161/183] DOC: add floats and ints missing as acceptable arguments for pandas.to_datetime (#14864) --- pandas/tseries/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 326bc5be3fd8f..21e1c9744aa88 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -183,7 +183,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, Parameters ---------- - arg : string, datetime, list, tuple, 1-d array, Series + arg : integer, float, string, datetime, list, tuple, 1-d array, Series .. versionadded: 0.18.1 From 96b171a6593fdab6b4b20157bf4d2e8bd72c5fb2 Mon Sep 17 00:00:00 2001 From: "hesham.shabana@hotmail.com" Date: Mon, 12 Dec 2016 10:34:17 +0200 Subject: [PATCH 162/183] DOC: fix groupby.rst for building issues closes #14861 closes #14863 --- doc/source/groupby.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index ff97775afc2e2..f3fcd6901a440 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -631,11 +631,11 @@ the column B based on the groups of column A. .. ipython:: python - df = pd.DataFrame({'A': [1] * 10 + [5] * 10, - 'B': np.arange(20)}) - df - - df.groupby('A').rolling(4).B.mean() + df_re = pd.DataFrame({'A': [1] * 10 + [5] * 10, + 'B': np.arange(20)}) + df_re + + df_re.groupby('A').rolling(4).B.mean() The ``expanding()`` method will accumulate a given operation @@ -644,7 +644,7 @@ group. .. ipython:: python - df.groupby('A').expanding().sum() + df_re.groupby('A').expanding().sum() Suppose you want to use the ``resample()`` method to get a daily @@ -653,14 +653,14 @@ missing values with the ``ffill()`` method. .. ipython:: python - df = pd.DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') - df + df_re = pd.DataFrame({'date': pd.date_range(start='2016-01-01', + periods=4, + freq='W'), + 'group': [1, 1, 2, 2], + 'val': [5, 6, 7, 8]}).set_index('date') + df_re - df.groupby('group').resample('1D').ffill() + df_re.groupby('group').resample('1D').ffill() .. 
_groupby.filter: From 14e4815391dcd8c9fe91479fed629410bf63ca33 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 8 Dec 2016 12:52:53 -0500 Subject: [PATCH 163/183] BF: boost min cython to 0.23 closes #14699 closes #14831 closes #14508 --- ci/install_travis.sh | 2 +- ci/requirements-2.7.build | 2 +- ci/requirements-2.7_COMPAT.build | 2 +- ci/requirements-2.7_LOCALE.build | 2 +- doc/source/install.rst | 2 +- doc/source/whatsnew/v0.16.1.txt | 0 doc/source/whatsnew/v0.17.1.txt | 0 doc/source/whatsnew/v0.20.0.txt | 20 ++++++++++++++++++-- doc/sphinxext/numpydoc/LICENSE.txt | 0 setup.py | 2 +- 10 files changed, 24 insertions(+), 8 deletions(-) mode change 100755 => 100644 doc/source/whatsnew/v0.16.1.txt mode change 100755 => 100644 doc/source/whatsnew/v0.17.1.txt mode change 100755 => 100644 doc/sphinxext/numpydoc/LICENSE.txt diff --git a/ci/install_travis.sh b/ci/install_travis.sh index bdd2c01f611b2..b9b1115090031 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -107,7 +107,7 @@ if [ "$BUILD_TEST" ]; then # build testing pip uninstall --yes cython - pip install cython==0.19.1 + pip install cython==0.23 ( python setup.py build_ext --inplace && python setup.py develop ) || true else diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build index b2e2038faf7c3..836385671d603 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -1,4 +1,4 @@ python-dateutil=2.4.1 pytz=2013b numpy -cython=0.19.1 +cython=0.23 diff --git a/ci/requirements-2.7_COMPAT.build b/ci/requirements-2.7_COMPAT.build index 85148069a9e6a..95e3da03f161b 100644 --- a/ci/requirements-2.7_COMPAT.build +++ b/ci/requirements-2.7_COMPAT.build @@ -1,4 +1,4 @@ numpy=1.7.1 -cython=0.19.1 +cython=0.23 dateutil=1.5 pytz=2013b diff --git a/ci/requirements-2.7_LOCALE.build b/ci/requirements-2.7_LOCALE.build index ada6686f599ca..c17730b912651 100644 --- a/ci/requirements-2.7_LOCALE.build +++ b/ci/requirements-2.7_LOCALE.build @@ -1,4 +1,4 @@ python-dateutil pytz=2013b numpy=1.7.1 -cython=0.19.1 +cython=0.23 diff --git a/doc/source/install.rst b/doc/source/install.rst index 55b6b5fa69efb..d45b8765cfd8a 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -243,7 +243,7 @@ Optional Dependencies ~~~~~~~~~~~~~~~~~~~~~ * `Cython `__: Only necessary to build development - version. Version 0.19.1 or higher. + version. Version 0.23 or higher. * `SciPy `__: miscellaneous statistical functions * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended. diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt old mode 100755 new mode 100644 diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt old mode 100755 new mode 100644 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8fdef39a3ae98..8d88a7b4fb215 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -9,6 +9,7 @@ users upgrade to this version. Highlights include: +- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -54,7 +55,7 @@ Other enhancements - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an unsorted MultiIndex (:issue:`11897`). 
This allows differentiation between errors due to lack of sorting or an incorrect key. See :ref:`here ` - + - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) @@ -119,4 +120,19 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) \ No newline at end of file +- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) + + + + + + + + + + + + + + +- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) diff --git a/doc/sphinxext/numpydoc/LICENSE.txt b/doc/sphinxext/numpydoc/LICENSE.txt old mode 100755 new mode 100644 diff --git a/setup.py b/setup.py index 2bef65c9719dc..7a55daa74b1c5 100755 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def is_platform_mac(): import versioneer cmdclass = versioneer.get_cmdclass() -min_cython_ver = '0.19.1' +min_cython_ver = '0.23' try: import Cython ver = Cython.__version__ From 110ac2acadf4d5ebe8ea651838531774913c0f62 Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Sun, 11 Sep 2016 16:55:25 -0400 Subject: [PATCH 164/183] Move compression code to io.common._get_handle xref #14576 closes #13340 --- pandas/io/common.py | 63 +++++++++++++++++++++++++++++++++----------- pandas/io/parsers.py | 18 +++---------- 2 files changed, 51 insertions(+), 30 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 7076d5a62b626..70dbb0f446e44 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -287,53 +287,84 @@ def ZipFile(*args, **kwargs): ZipFile = zipfile.ZipFile -def _get_handle(path, mode, encoding=None, compression=None, memory_map=False): +def _get_handle(source, mode, encoding=None, compression=None, memory_map=False): """Gets file handle for given path and mode. 
""" - if compression is not None: - if encoding is not None and not compat.PY3: + + f = source + is_path = isinstance(source, compat.string_types) + + # in Python 3, convert BytesIO or fileobjects passed with an encoding + if compat.PY3 and isinstance(source, compat.BytesIO): + from io import TextIOWrapper + + return TextIOWrapper(source, encoding=encoding) + + elif compression is not None: + compression = compression.lower() + if encoding is not None and not compat.PY3 and not is_path: msg = 'encoding + compression not yet supported in Python 2' raise ValueError(msg) + # GZ Compression if compression == 'gzip': import gzip - f = gzip.GzipFile(path, mode) + + f = gzip.GzipFile(source, mode) \ + if is_path else gzip.GzipFile(fileobj=source) + + # BZ Compression elif compression == 'bz2': import bz2 - f = bz2.BZ2File(path, mode) + + if is_path: + f = bz2.BZ2File(source, mode) + + else: + f = bz2.BZ2File(source) if compat.PY3 else StringIO( + bz2.decompress(source.read())) + # Python 2's bz2 module can't take file objects, so have to + # run through decompress manually + + # ZIP Compression elif compression == 'zip': import zipfile - zip_file = zipfile.ZipFile(path) + zip_file = zipfile.ZipFile(source) zip_names = zip_file.namelist() if len(zip_names) == 1: - file_name = zip_names.pop() - f = zip_file.open(file_name) + f = zip_file.open(zip_names.pop()) elif len(zip_names) == 0: raise ValueError('Zero files found in ZIP file {}' - .format(path)) + .format(source)) else: raise ValueError('Multiple files found in ZIP file.' ' Only one file per ZIP :{}' .format(zip_names)) + + # XZ Compression elif compression == 'xz': lzma = compat.import_lzma() - f = lzma.LZMAFile(path, mode) + f = lzma.LZMAFile(source, mode) + else: - raise ValueError('Unrecognized compression type: %s' % - compression) + raise ValueError('Unrecognized compression: %s' % compression) + if compat.PY3: from io import TextIOWrapper + f = TextIOWrapper(f, encoding=encoding) + return f - else: + + elif is_path: if compat.PY3: if encoding: - f = open(path, mode, encoding=encoding) + f = open(source, mode, encoding=encoding) else: - f = open(path, mode, errors='replace') + f = open(source, mode, errors='replace') else: - f = open(path, mode) + f = open(source, mode) if memory_map and hasattr(f, 'fileno'): try: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 30443f894a64d..bd4c5ac348a44 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1890,20 +1890,10 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] - if isinstance(f, compat.string_types): - f = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression, - memory_map=self.memory_map) - self.handles.append(f) - elif self.compression: - f = _wrap_compressed(f, self.compression, self.encoding) - self.handles.append(f) - # in Python 3, convert BytesIO or fileobjects passed with an encoding - elif compat.PY3 and isinstance(f, compat.BytesIO): - from io import TextIOWrapper - - f = TextIOWrapper(f, encoding=self.encoding) - self.handles.append(f) + f = _get_handle(f, 'r', encoding=self.encoding, + compression=self.compression, + memory_map=self.memory_map) + self.handles.append(f) # Set self.data to something that can read lines. 
if hasattr(f, 'readline'): From 37614485a9740df1c55e7f0da2d32216e2561af1 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Tue, 13 Dec 2016 13:33:18 -0500 Subject: [PATCH 165/183] CLN: Refactor compression code to expand URL support closes #14576 closes #12688 closes #14570 xref #14874 --- pandas/formats/format.py | 6 +- pandas/io/common.py | 203 +++++++++++++------------ pandas/io/json.py | 6 +- pandas/io/parsers.py | 105 ++----------- pandas/io/s3.py | 4 +- pandas/io/tests/parser/compression.py | 5 + pandas/io/tests/parser/test_network.py | 54 ++++--- 7 files changed, 166 insertions(+), 217 deletions(-) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 7706666142a64..0cf6050e515e0 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1455,9 +1455,9 @@ def save(self): f = self.path_or_buf close = False else: - f = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, - compression=self.compression) + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=self.encoding, + compression=self.compression) close = True try: diff --git a/pandas/io/common.py b/pandas/io/common.py index 70dbb0f446e44..b5a3aec490608 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,11 +1,9 @@ """Common IO api utilities""" -import sys import os import csv import codecs import mmap -import zipfile from contextlib import contextmanager, closing from pandas.compat import StringIO, BytesIO, string_types, text_type @@ -141,39 +139,6 @@ def _is_s3_url(url): return False -def maybe_read_encoded_stream(reader, encoding=None, compression=None): - """read an encoded stream from the reader and transform the bytes to - unicode if required based on the encoding - - Parameters - ---------- - reader : a streamable file-like object - encoding : optional, the encoding to attempt to read - - Returns - ------- - a tuple of (a stream of decoded bytes, the encoding which was used) - - """ - - if compat.PY3 or encoding is not None: # pragma: no cover - if encoding: - errors = 'strict' - else: - errors = 'replace' - encoding = 'utf-8' - - if compression == 'gzip': - reader = BytesIO(reader.read()) - else: - reader = StringIO(reader.read().decode(encoding, errors)) - else: - if compression == 'gzip': - reader = BytesIO(reader.read()) - encoding = None - return reader, encoding - - def _expand_user(filepath_or_buffer): """Return the argument with an initial component of ~ or ~user replaced by that user's home directory. 
@@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, """ if _is_url(filepath_or_buffer): - req = _urlopen(str(filepath_or_buffer)) - if compression == 'infer': - content_encoding = req.headers.get('Content-Encoding', None) - if content_encoding == 'gzip': - compression = 'gzip' - else: - compression = None - # cat on the compression to the tuple returned by the function - to_return = (list(maybe_read_encoded_stream(req, encoding, - compression)) + - [compression]) - return tuple(to_return) + url = str(filepath_or_buffer) + req = _urlopen(url) + content_encoding = req.headers.get('Content-Encoding', None) + if content_encoding == 'gzip': + # Override compression based on Content-Encoding header + compression = 'gzip' + reader = BytesIO(req.read()) + return reader, encoding, compression if _is_s3_url(filepath_or_buffer): from pandas.io.s3 import get_filepath_or_buffer @@ -276,95 +237,145 @@ def file_path_to_url(path): return urljoin('file:', pathname2url(path)) -# ZipFile is not a context manager for <= 2.6 -# must be tuple index here since 2.6 doesn't use namedtuple for version_info -if sys.version_info[1] <= 6: - @contextmanager - def ZipFile(*args, **kwargs): - with closing(zipfile.ZipFile(*args, **kwargs)) as zf: - yield zf -else: - ZipFile = zipfile.ZipFile +_compression_to_extension = { + 'gzip': '.gz', + 'bz2': '.bz2', + 'zip': '.zip', + 'xz': '.xz', +} -def _get_handle(source, mode, encoding=None, compression=None, memory_map=False): - """Gets file handle for given path and mode. +def _infer_compression(filepath_or_buffer, compression): + """ + If compression='infer', infer compression. If compression """ - f = source - is_path = isinstance(source, compat.string_types) + # No compression has been explicitly specified + if compression is None: + return None - # in Python 3, convert BytesIO or fileobjects passed with an encoding - if compat.PY3 and isinstance(source, compat.BytesIO): - from io import TextIOWrapper + # Cannot infer compression of a buffer. Hence assume no compression. + is_path = isinstance(filepath_or_buffer, compat.string_types) + if compression == 'infer' and not is_path: + return None + + # Infer compression from the filename/URL extension + if compression == 'infer': + for compression, extension in _compression_to_extension.items(): + if filepath_or_buffer.endswith(extension): + return compression + return None - return TextIOWrapper(source, encoding=encoding) + # Compression has been specified. Check that it's valid + if compression in _compression_to_extension: + return compression - elif compression is not None: - compression = compression.lower() - if encoding is not None and not compat.PY3 and not is_path: - msg = 'encoding + compression not yet supported in Python 2' + msg = 'Unrecognized compression type: {}'.format(compression) + valid = ['infer', None] + sorted(_compression_to_extension) + msg += '\nValid compression types are {}'.format(valid) + raise ValueError(msg) + + +def _get_handle(path_or_buf, mode, encoding=None, compression=None, + memory_map=False): + """ + Get file handle for given path/buffer and mode. + + Parameters + ---------- + path_or_buf : + a path (str) or buffer + mode : str + mode to open path_or_buf with + encoding : str or None + compression : str or None + Supported compression protocols are gzip, bz2, zip, and xz + memory_map : boolean, default False + See parsers._parser_params for more information. 
+ + Returns + ------- + f : file-like + A file-like object + handles : list of file-like objects + A list of file-like object that were openned in this function. + """ + + handles = list() + f = path_or_buf + is_path = isinstance(path_or_buf, compat.string_types) + + if compression: + + if compat.PY2 and not is_path and encoding: + msg = 'compression with encoding is not yet supported in Python 2' raise ValueError(msg) # GZ Compression if compression == 'gzip': import gzip - - f = gzip.GzipFile(source, mode) \ - if is_path else gzip.GzipFile(fileobj=source) + if is_path: + f = gzip.open(path_or_buf, mode) + else: + f = gzip.GzipFile(fileobj=path_or_buf) # BZ Compression elif compression == 'bz2': import bz2 - if is_path: - f = bz2.BZ2File(source, mode) - - else: - f = bz2.BZ2File(source) if compat.PY3 else StringIO( - bz2.decompress(source.read())) + f = bz2.BZ2File(path_or_buf, mode) + elif compat.PY2: # Python 2's bz2 module can't take file objects, so have to # run through decompress manually + f = StringIO(bz2.decompress(path_or_buf.read())) + path_or_buf.close() + else: + f = bz2.BZ2File(path_or_buf) # ZIP Compression elif compression == 'zip': import zipfile - zip_file = zipfile.ZipFile(source) + zip_file = zipfile.ZipFile(path_or_buf) zip_names = zip_file.namelist() - if len(zip_names) == 1: f = zip_file.open(zip_names.pop()) elif len(zip_names) == 0: raise ValueError('Zero files found in ZIP file {}' - .format(source)) + .format(path_or_buf)) else: raise ValueError('Multiple files found in ZIP file.' - ' Only one file per ZIP :{}' + ' Only one file per ZIP: {}' .format(zip_names)) # XZ Compression elif compression == 'xz': lzma = compat.import_lzma() - f = lzma.LZMAFile(source, mode) + f = lzma.LZMAFile(path_or_buf, mode) + # Unrecognized Compression else: - raise ValueError('Unrecognized compression: %s' % compression) - - if compat.PY3: - from io import TextIOWrapper - - f = TextIOWrapper(f, encoding=encoding) + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) - return f + handles.append(f) elif is_path: - if compat.PY3: - if encoding: - f = open(source, mode, encoding=encoding) - else: - f = open(source, mode, errors='replace') + if compat.PY2: + # Python 2 + f = open(path_or_buf, mode) + elif encoding: + # Python 3 and encoding + f = open(path_or_buf, mode, encoding=encoding) else: - f = open(source, mode) + # Python 3 and no explicit encoding + f = open(path_or_buf, mode, errors='replace') + handles.append(f) + + # in Python 3, convert BytesIO or fileobjects passed with an encoding + if compat.PY3 and (compression or isinstance(f, compat.BytesIO)): + from io import TextIOWrapper + f = TextIOWrapper(f, encoding=encoding) + handles.append(f) if memory_map and hasattr(f, 'fileno'): try: @@ -378,7 +389,7 @@ def _get_handle(source, mode, encoding=None, compression=None, memory_map=False) # leave the file handler as is then pass - return f + return f, handles class MMapWrapper(BaseIterator): diff --git a/pandas/io/json.py b/pandas/io/json.py index 878506a6ddc05..5b1a40736ace3 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -259,8 +259,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, exists = False if exists: - with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh: - json = fh.read() + fh, handles = _get_handle(filepath_or_buffer, 'r', + encoding=encoding) + json = fh.read() + fh.close() else: json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): diff --git a/pandas/io/parsers.py 
b/pandas/io/parsers.py index bd4c5ac348a44..3cd23150bb0bf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -27,12 +27,11 @@ from pandas.core.frame import DataFrame from pandas.core.categorical import Categorical from pandas.core.common import AbstractMethodError -from pandas.core.config import get_option from pandas.io.date_converters import generic_parser from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, _get_handle, UnicodeReader, UTF8Recoder, BaseIterator, ParserError, EmptyDataError, - ParserWarning, _NA_VALUES) + ParserWarning, _NA_VALUES, _infer_compression) from pandas.tseries import tools from pandas.util.decorators import Appender @@ -354,37 +353,17 @@ def _validate_nrows(nrows): def _read(filepath_or_buffer, kwds): - "Generic reader of line files." + """Generic reader of line files.""" encoding = kwds.get('encoding', None) if encoding is not None: encoding = re.sub('_', '-', encoding).lower() kwds['encoding'] = encoding - # If the input could be a filename, check for a recognizable compression - # extension. If we're reading from a URL, the `get_filepath_or_buffer` - # will use header info to determine compression, so use what it finds in - # that case. - inferred_compression = kwds.get('compression') - if inferred_compression == 'infer': - if isinstance(filepath_or_buffer, compat.string_types): - if filepath_or_buffer.endswith('.gz'): - inferred_compression = 'gzip' - elif filepath_or_buffer.endswith('.bz2'): - inferred_compression = 'bz2' - elif filepath_or_buffer.endswith('.zip'): - inferred_compression = 'zip' - elif filepath_or_buffer.endswith('.xz'): - inferred_compression = 'xz' - else: - inferred_compression = None - else: - inferred_compression = None - + compression = kwds.get('compression') + compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression = get_filepath_or_buffer( - filepath_or_buffer, encoding, - compression=kwds.get('compression', None)) - kwds['compression'] = (inferred_compression if compression == 'infer' - else compression) + filepath_or_buffer, encoding, compression) + kwds['compression'] = compression if kwds.get('date_parser', None) is not None: if isinstance(kwds['parse_dates'], bool): @@ -1771,70 +1750,6 @@ def count_empty_vals(vals): return sum([1 for v in vals if v == '' or v is None]) -def _wrap_compressed(f, compression, encoding=None): - """wraps compressed fileobject in a decompressing fileobject - NOTE: For all files in Python 3.2 and for bzip'd files under all Python - versions, this means reading in the entire file and then re-wrapping it in - StringIO. 
- """ - compression = compression.lower() - encoding = encoding or get_option('display.encoding') - - if compression == 'gzip': - import gzip - - f = gzip.GzipFile(fileobj=f) - if compat.PY3: - from io import TextIOWrapper - - f = TextIOWrapper(f) - return f - elif compression == 'bz2': - import bz2 - - if compat.PY3: - f = bz2.open(f, 'rt', encoding=encoding) - else: - # Python 2's bz2 module can't take file objects, so have to - # run through decompress manually - data = bz2.decompress(f.read()) - f = StringIO(data) - return f - elif compression == 'zip': - import zipfile - zip_file = zipfile.ZipFile(f) - zip_names = zip_file.namelist() - - if len(zip_names) == 1: - file_name = zip_names.pop() - f = zip_file.open(file_name) - return f - - elif len(zip_names) == 0: - raise ValueError('Corrupted or zero files found in compressed ' - 'zip file %s', zip_file.filename) - - else: - raise ValueError('Multiple files found in compressed ' - 'zip file %s', str(zip_names)) - - elif compression == 'xz': - - lzma = compat.import_lzma() - f = lzma.LZMAFile(f) - - if compat.PY3: - from io import TextIOWrapper - - f = TextIOWrapper(f) - - return f - - else: - raise ValueError('do not recognize compression method %s' - % compression) - - class PythonParser(ParserBase): def __init__(self, f, **kwds): @@ -1890,10 +1805,10 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] - f = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression, - memory_map=self.memory_map) - self.handles.append(f) + f, handles = _get_handle(f, 'r', encoding=self.encoding, + compression=self.compression, + memory_map=self.memory_map) + self.handles.extend(handles) # Set self.data to something that can read lines. if hasattr(f, 'readline'): diff --git a/pandas/io/s3.py b/pandas/io/s3.py index df8f1d9187031..8aa3694834a0a 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -99,9 +99,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, conn = boto.connect_s3(host=s3_host, anon=True) b = conn.get_bucket(parsed_url.netloc, validate=False) - if compat.PY2 and (compression == 'gzip' or - (compression == 'infer' and - filepath_or_buffer.endswith(".gz"))): + if compat.PY2 and compression: k = boto.s3.key.Key(b, parsed_url.path) filepath_or_buffer = BytesIO(k.get_contents_as_string( encoding=encoding)) diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py index 47ae7be1cbf05..3b0c571032fe6 100644 --- a/pandas/io/tests/parser/compression.py +++ b/pandas/io/tests/parser/compression.py @@ -168,3 +168,8 @@ def test_read_csv_infer_compression(self): tm.assert_frame_equal(expected, df) inputs[3].close() + + def test_invalid_compression(self): + msg = 'Unrecognized compression type: sfark' + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv('test_file.zip', compression='sfark') diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index 9b02096dd0f26..fd7a1babe4e01 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -7,6 +7,8 @@ import os import nose +import functools +from itertools import product import pandas.util.testing as tm from pandas import DataFrame @@ -14,24 +16,40 @@ from pandas.io.parsers import read_csv, read_table -class TestUrlGz(tm.TestCase): - - def setUp(self): - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') - self.local_table = read_table(localtable) - - @tm.network - def test_url_gz(self): 
- url = ('https://raw.github.com/pandas-dev/pandas/' - 'master/pandas/io/tests/parser/data/salaries.csv.gz') - url_table = read_table(url, compression="gzip", engine="python") - tm.assert_frame_equal(url_table, self.local_table) - - @tm.network - def test_url_gz_infer(self): - url = 'https://s3.amazonaws.com/pandas-test/salary.table.gz' - url_table = read_table(url, compression="infer", engine="python") +class TestCompressedUrl(object): + + compression_to_extension = { + 'gzip': '.gz', + 'bz2': '.bz2', + 'zip': '.zip', + 'xz': '.xz', + } + + def __init__(self): + path = os.path.join(tm.get_data_path(), 'salaries.csv') + self.local_table = read_table(path) + self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/' + 'pandas/io/tests/parser/data/salaries.csv') + + def test_compressed_urls(self): + """Test reading compressed tables from URL.""" + msg = ('Test reading {}-compressed tables from URL: ' + 'compression="{}", engine="{}"') + + for compression, extension in self.compression_to_extension.items(): + url = self.base_url + extension + # args is a (compression, engine) tuple + for args in product([compression, 'infer'], ['python']): + # test_fxn is a workaround for more descriptive nose reporting. + # See http://stackoverflow.com/a/37393684/4651668. + test_fxn = functools.partial(self.check_table) + test_fxn.description = msg.format(compression, *args) + yield (test_fxn, url) + args + + def check_table(self, url, compression, engine): + if url.endswith('.xz'): + tm._skip_if_no_lzma() + url_table = read_table(url, compression=compression, engine=engine) tm.assert_frame_equal(url_table, self.local_table) From 4a5aec40e8b2d6789f946e3e5b5b07ba5e753eb6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 13 Dec 2016 18:07:56 -0500 Subject: [PATCH 166/183] DOC: doc-string for infer_compression --- pandas/io/common.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index b5a3aec490608..c115fab217fba 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -247,6 +247,22 @@ def file_path_to_url(path): def _infer_compression(filepath_or_buffer, compression): """ + Get file handle for given path/buffer and mode. + + Parameters + ---------- + filepath_or_buf : + a path (str) or buffer + compression : str, or None + + Returns + ------- + string compression method, None + + Raises + ------ + ValueError on invalid compression specified + If compression='infer', infer compression. 
If compression """ From 7d8bc0deaeb8237a0cf361048363c78f4867f218 Mon Sep 17 00:00:00 2001 From: dickreuter Date: Tue, 13 Dec 2016 18:16:32 -0500 Subject: [PATCH 167/183] ENH: Added errors{'raise','ignore'} for keys not found in meta for json_normalize Author: dickreuter Closes #14583 from dickreuter/json_normalize_enhancement and squashes the following commits: 701c140 [dickreuter] adjusted formatting 3c94206 [dickreuter] shortened lines to pass linting 2028924 [dickreuter] doc changes d298588 [dickreuter] Fixed as instructed in pull request page bcfbf18 [dickreuter] Avoids exception when pandas.io.json.json_normalize --- doc/source/whatsnew/v0.20.0.txt | 2 + pandas/io/json.py | 22 +++++++++- pandas/io/tests/json/test_json_norm.py | 59 ++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8d88a7b4fb215..159273928ae1d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -61,6 +61,8 @@ Other enhancements - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) +- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) + .. _whatsnew_0200.api_breaking: diff --git a/pandas/io/json.py b/pandas/io/json.py index 5b1a40736ace3..0a6b8af179e12 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -725,7 +725,9 @@ def nested_to_record(ds, prefix="", level=0): def json_normalize(data, record_path=None, meta=None, meta_prefix=None, - record_prefix=None): + record_prefix=None, + errors='raise'): + """ "Normalize" semi-structured JSON data into a flat table @@ -742,6 +744,13 @@ def json_normalize(data, record_path=None, meta=None, If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'] meta_prefix : string, default None + errors : {'raise', 'ignore'}, default 'raise' + * ignore : will ignore KeyError if keys listed in meta are not + always present + * raise : will raise KeyError if keys listed in meta are not + always present + + .. 
versionadded:: 0.20.0 Returns ------- @@ -841,7 +850,16 @@ def _recursive_extract(data, path, seen_meta, level=0): if level + 1 > len(val): meta_val = seen_meta[key] else: - meta_val = _pull_field(obj, val[level:]) + try: + meta_val = _pull_field(obj, val[level:]) + except KeyError as e: + if errors == 'ignore': + meta_val = np.nan + else: + raise \ + KeyError("Try running with " + "errors='ignore' as key " + "%s is not always present", e) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index 4848db97194d9..36110898448ea 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -225,6 +225,65 @@ def test_nested_flattens(self): self.assertEqual(result, expected) + def test_json_normalize_errors(self): + # GH14583: If meta keys are not always present + # a new option to set errors='ignore' has been implemented + i = { + "Trades": [{ + "general": { + "tradeid": 100, + "trade_version": 1, + "stocks": [{ + + "symbol": "AAPL", + "name": "Apple", + "price": "0" + }, { + "symbol": "GOOG", + "name": "Google", + "price": "0" + } + ] + } + }, { + "general": { + "tradeid": 100, + "stocks": [{ + "symbol": "AAPL", + "name": "Apple", + "price": "0" + }, { + "symbol": "GOOG", + "name": "Google", + "price": "0" + } + ] + } + } + ] + } + j = json_normalize(data=i['Trades'], + record_path=[['general', 'stocks']], + meta=[['general', 'tradeid'], + ['general', 'trade_version']], + errors='ignore') + expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, + 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, + 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, + 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, + 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} + + self.assertEqual(j.fillna('').to_dict(), expected) + + self.assertRaises(KeyError, + json_normalize, data=i['Trades'], + record_path=[['general', 'stocks']], + meta=[['general', 'tradeid'], + ['general', 'trade_version']], + errors='raise' + ) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'], exit=False) From 86233e15193c3bcd0f646915891ca6c7892335d9 Mon Sep 17 00:00:00 2001 From: nuffe Date: Tue, 13 Dec 2016 18:26:07 -0500 Subject: [PATCH 168/183] ENH/DOC: wide_to_long performance and docstring clarification closes #14778 Please see regex search on long columns by first converting to Categorical, avoid melting all dataframes with all the id variables, and wait with trying to convert the "time" variable to `int` until last), and clear up the docstring. 
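The main trick behind the speed-up described above can be shown with a small sketch (illustrative only, not part of the patch; the column values and sizes below are made up): converting the melted suffix column to ``category`` dtype before stripping the stub means the string replacement is applied once per unique category rather than once per row, which is what matters when the column is long but has only a handful of distinct suffixes::

    import re
    import numpy as np
    import pandas as pd

    # a long suffix column with only two distinct values, as melt would produce
    suffixes = pd.Series(np.tile(['A1970', 'A1980'], 50000))

    # plain object dtype: the regex replace has to visit every single row
    slow = suffixes.str.replace(re.escape('A'), '')

    # the approach taken in the patch: convert to Categorical first, so
    # .str.replace only rewrites the (two) categories and maps them back
    fast = pd.Series(pd.Categorical(suffixes)).str.replace(re.escape('A'), '')

    assert list(slow) == list(fast)   # same result, far less string work
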
Author: nuffe Closes #14779 from nuffe/wide2longfix and squashes the following commits: df1edf8 [nuffe] asv_bench: fix indentation and simplify dc13064 [nuffe] Set docstring to raw literal to allow backslashes to be printed (still had to escape them) 295d1e6 [nuffe] Use pd.Index in doc example 1c49291 [nuffe] Can of course get rid of the negative lookahead now that suffix is a regex 54c5920 [nuffe] Specify the suffix with a regex 5747a25 [nuffe] ENH/DOC: wide_to_long performance and functionality improvements (#14779) --- asv_bench/benchmarks/reshape.py | 24 +++- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/reshape.py | 214 ++++++++++++++++++++++++++------ pandas/tests/test_reshape.py | 201 +++++++++++++++++++++++++++++- 5 files changed, 402 insertions(+), 39 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index ab235e085986c..a3ecfff52c794 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,5 +1,5 @@ from .pandas_vb_common import * -from pandas.core.reshape import melt +from pandas.core.reshape import melt, wide_to_long class melt_dataframe(object): @@ -74,3 +74,25 @@ def setup(self): def time_unstack_sparse_keyspace(self): self.idf.unstack() + + +class wide_to_long_big(object): + goal_time = 0.2 + + def setup(self): + vars = 'ABCD' + nyrs = 20 + nidvars = 20 + N = 5000 + yrvars = [] + for var in vars: + for yr in range(1, nyrs + 1): + yrvars.append(var + str(yr)) + + self.df = pd.DataFrame(np.random.randn(N, nidvars + len(yrvars)), + columns=list(range(nidvars)) + yrvars) + self.vars = vars + + def time_wide_to_long_big(self): + self.df['id'] = self.df.index + wide_to_long(self.df, list(self.vars), i='id', j='year') diff --git a/doc/source/api.rst b/doc/source/api.rst index 929664840f583..b8157929bd940 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -157,6 +157,7 @@ Data manipulations concat get_dummies factorize + wide_to_long Top-level missing data ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 159273928ae1d..d0009efd2d994 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -113,6 +113,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 89317d51f722d..b359c54535b28 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -3,6 +3,7 @@ from pandas.compat import range, zip from pandas import compat import itertools +import re import numpy as np @@ -877,29 +878,55 @@ def lreshape(data, groups, dropna=True, label=None): return DataFrame(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j): - """ +def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): + r""" Wide panel to long format. Less flexible but more user-friendly than melt. + With stubnames ['A', 'B'], this function expects to find one or more + groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... + You specify what you want to call this suffix in the resulting long format + with `j` (for example `j='year'`) + + Each row of these wide variables is assumed to be uniquely identified by + `i` (can be a single column name or a list of column names) + + All remaining variables in the data frame are left intact.
+ Parameters ---------- df : DataFrame The wide-format DataFrame - stubnames : list - A list of stub names. The wide format variables are assumed to + stubnames : str or list-like + The stub name(s). The wide format variables are assumed to start with the stub names. - i : str - The name of the id variable. + i : str or list-like + Column(s) to use as id variable(s) j : str - The name of the subobservation variable. - stubend : str - Regex to match for the end of the stubs. + The name of the subobservation variable. What you wish to name your + suffix in the long format. + sep : str, default "" + A character indicating the separation of the variable names + in the wide format, to be stripped from the names in the long format. + For example, if your column names are A-suffix1, A-suffix2, you + can strip the hypen by specifying `sep='-'` + + .. versionadded:: 0.20.0 + + suffix : str, default '\\d+' + A regular expression capturing the wanted suffixes. '\\d+' captures + numeric suffixes. Suffixes with no numbers could be specified with the + negated character class '\\D+'. You can also further disambiguate + suffixes, for example, if your wide variables are of the form + Aone, Btwo,.., and you have an unrelated column Arating, you can + ignore the last one by specifying `suffix='(!?one|two)'` + + .. versionadded:: 0.20.0 Returns ------- DataFrame - A DataFrame that contains each stub name as a variable as well as - variables for i and j. + A DataFrame that contains each stub name as a variable, with new index + (i, j) Examples -------- @@ -918,7 +945,7 @@ def wide_to_long(df, stubnames, i, j): 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 - >>> wide_to_long(df, ["A", "B"], i="id", j="year") + >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") X A B id year 0 1970 -1.085631 a 2.5 @@ -928,38 +955,151 @@ def wide_to_long(df, stubnames, i, j): 1 1980 0.997345 e 1.3 2 1980 0.282978 f 0.1 + With multuple id columns + + >>> df = pd.DataFrame({ + ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + ... }) + >>> df + birth famid ht1 ht2 + 0 1 1 2.8 3.4 + 1 2 1 2.9 3.8 + 2 3 1 2.2 2.9 + 3 1 2 2.0 3.2 + 4 2 2 1.8 2.8 + 5 3 2 1.9 2.4 + 6 1 3 2.2 3.3 + 7 2 3 2.3 3.4 + 8 3 3 2.1 2.9 + >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') + >>> l + ht + famid birth age + 1 1 1 2.8 + 2 3.4 + 2 1 2.9 + 2 3.8 + 3 1 2.2 + 2 2.9 + 2 1 1 2.0 + 2 3.2 + 2 1 1.8 + 2 2.8 + 3 1 1.9 + 2 2.4 + 3 1 1 2.2 + 2 3.3 + 2 1 2.3 + 2 3.4 + 3 1 2.1 + 2 2.9 + + Going from long back to wide just takes some creative use of `unstack` + + >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() + >>> w.columns = pd.Index(w.columns).str.join('') + >>> w.reset_index() + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + + Less wieldy column names are also handled + + >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), + ... 'A(quarterly)-2011': np.random.rand(3), + ... 'B(quarterly)-2010': np.random.rand(3), + ... 'B(quarterly)-2011': np.random.rand(3), + ... 
'X' : np.random.randint(3, size=3)}) + >>> df['id'] = df.index + >>> df + A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011 + 0 0.531828 0.724455 0.322959 0.293714 + 1 0.634401 0.611024 0.361789 0.630976 + 2 0.849432 0.722443 0.228263 0.092105 + \ + X id + 0 0 0 + 1 1 1 + 2 2 2 + >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], + i='id', j='year', sep='-') + X A(quarterly) B(quarterly) + id year + 0 2010 0 0.531828 0.322959 + 1 2010 2 0.634401 0.361789 + 2 2010 2 0.849432 0.228263 + 0 2011 0 0.724455 0.293714 + 1 2011 2 0.611024 0.630976 + 2 2011 2 0.722443 0.092105 + + If we have many columns, we could also use a regex to find our + stubnames and pass that list on to wide_to_long + + >>> stubnames = set([match[0] for match in + df.columns.str.findall('[A-B]\(.*\)').values + if match != [] ]) + >>> list(stubnames) + ['B(quarterly)', 'A(quarterly)'] + Notes ----- - All extra variables are treated as extra id variables. This simply uses + All extra variables are left untouched. This simply uses `pandas.melt` under the hood, but is hard-coded to "do the right thing" in a typicaly case. """ - - def get_var_names(df, regex): + def get_var_names(df, stub, sep, suffix): + regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix) return df.filter(regex=regex).columns.tolist() - def melt_stub(df, stub, i, j): - varnames = get_var_names(df, "^" + stub) - newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub, - var_name=j) - newdf_j = newdf[j].str.replace(stub, "") - try: - newdf_j = newdf_j.astype(int) - except ValueError: - pass - newdf[j] = newdf_j - return newdf - - id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames)) - if i not in id_vars: - id_vars += [i] - - newdf = melt_stub(df, stubnames[0], id_vars, j) - - for stub in stubnames[1:]: - new = melt_stub(df, stub, id_vars, j) - newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False) - return newdf.set_index([i, j]) + def melt_stub(df, stub, i, j, value_vars, sep): + newdf = melt(df, id_vars=i, value_vars=value_vars, + value_name=stub.rstrip(sep), var_name=j) + newdf[j] = Categorical(newdf[j]) + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") + + return newdf.set_index(i + [j]) + + if any(map(lambda s: s in df.columns.tolist(), stubnames)): + raise ValueError("stubname can't be identical to a column name") + + if not is_list_like(stubnames): + stubnames = [stubnames] + else: + stubnames = list(stubnames) + + if not is_list_like(i): + i = [i] + else: + i = list(i) + + value_vars = list(map(lambda stub: + get_var_names(df, stub, sep, suffix), stubnames)) + + value_vars_flattened = [e for sublist in value_vars for e in sublist] + id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) + + melted = [] + for s, v in zip(stubnames, value_vars): + melted.append(melt_stub(df, s, i, j, v, sep)) + melted = melted[0].join(melted[1:], how='outer') + + if len(i) == 1: + new = df[id_vars].set_index(i).join(melted) + return new + + new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) + + return new def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 80d1f5f76e5a9..603674ac01bc0 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -698,7 +698,7 @@ def test_simple(self): exp_data = {"X": x.tolist() + x.tolist(), "A": ['a', 'b', 'c', 'd', 'e', 'f'], "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 
1980], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], "id": [0, 1, 2, 0, 1, 2]} exp_frame = DataFrame(exp_data) exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] @@ -716,6 +716,205 @@ def test_stubs(self): self.assertEqual(stubs, ['inc', 'edu']) + def test_separating_character(self): + # GH14779 + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A.1970": {0: "a", + 1: "b", + 2: "c"}, + "A.1980": {0: "d", + 1: "e", + 2: "f"}, + "B.1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B.1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_escapable_characters(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A(quarterly)1970": {0: "a", + 1: "b", + 2: "c"}, + "A(quarterly)1980": {0: "d", + 1: "e", + 2: "f"}, + "B(quarterly)1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B(quarterly)1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], + "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index( + ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] + long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], + i="id", j="year") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_unbalanced(self): + # test that we can have a varying amount of time variables + df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], + 'A': [1.0, 3.0, 2.0, 4.0], + 'B': [5.0, np.nan, 6.0, np.nan], + 'id': [0, 0, 1, 1], + 'year': ['2010', '2011', '2010', '2011']} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame, exp_frame) + + def test_character_overlap(self): + # Test we handle overlapping characters in both id_vars and value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'BBBX': [91, 92, 93], + 'BBBZ': [91, 92, 93] + }) + df['id'] = df.index + exp_frame = pd.DataFrame({ + 'BBBX': [91, 92, 93, 91, 92, 93], + 'BBBZ': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': [1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', '12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['BBBX', 'BBBZ', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_separator(self): + # if an invalid separator is supplied a empty data frame is returned + 
sep = 'nope!' + df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'A2010': [], + 'A2011': [], + 'B2010': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_num_string_disambiguation(self): + # Test that we can disambiguate number value_vars from + # string value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'Arating': [91, 92, 93], + 'Arating_old': [91, 92, 93] + }) + df['id'] = df.index + exp_frame = pd.DataFrame({ + 'Arating': [91, 92, 93, 91, 92, 93], + 'Arating_old': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': [1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', '12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['Arating', 'Arating_old', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_suffixtype(self): + # If all stubs names end with a string, but a numeric suffix is + # assumed, an empty data frame is returned + df = pd.DataFrame({'Aone': [1.0, 2.0], + 'Atwo': [3.0, 4.0], + 'Bone': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'Aone': [], + 'Atwo': [], + 'Bone': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_multiple_id_columns(self): + # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm + df = pd.DataFrame({ + 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + }) + exp_frame = pd.DataFrame({ + 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, + 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], + 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], + 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], + 'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1', + '2', '1', '2', '1', '2', '1', '2', '1', '2'] + }) + exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] + long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') + tm.assert_frame_equal(long_frame, exp_frame) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 510dd6706b149969a749957e8cfdabf7cfd4bd58 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 14 Dec 2016 03:07:26 -0800 Subject: [PATCH 169/183] TST: Parse dates with empty space (#6428) (#14862) + Add doc explaining parse_date limitation --- 
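For illustration, the workaround that the added documentation recommends looks roughly like this (file and column names are hypothetical):

    >>> df = pd.read_csv('data.csv', parse_dates=['opdate'])
    >>> df.dtypes['opdate']   # stays object if any value in the column could not be parsed
    >>> df['opdate'] = pd.to_datetime(df['opdate'], errors='coerce')  # unparseable values become NaT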
doc/source/io.rst | 6 ++++++ pandas/io/parsers.py | 4 ++++ pandas/io/tests/test_date_converters.py | 13 +++++++++++++ pandas/tseries/tests/test_timeseries.py | 12 ++++++++++++ 4 files changed, 35 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 75f36c5274cd2..17c7653072526 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -867,6 +867,12 @@ data columns: index_col=0) #index is the nominal column df +.. note:: + If a column or index contains an unparseable date, the entire column or + index will be returned unaltered as an object data type. For non-standard + datetime parsing, use :func:`to_datetime` after ``pd.read_csv``. + + .. note:: read_csv has a fast_path for parsing datetime strings in iso8601 format, e.g "2000-01-01T00:01:02+00:00" and similar variations. If you can arrange diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3cd23150bb0bf..200943324ce66 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -167,6 +167,10 @@ * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' + If a column or index contains an unparseable date, the entire column or + index will be returned unaltered as an object data type. For non-standard + datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv`` + Note: A fast-path exists for iso8601-formatted dates. infer_datetime_format : boolean, default False If True and parse_dates is enabled, pandas will attempt to infer the format diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py index 95fd2d52db009..3a0dd4eaa09e5 100644 --- a/pandas/io/tests/test_date_converters.py +++ b/pandas/io/tests/test_date_converters.py @@ -138,6 +138,19 @@ def date_parser(date, time): names=['datetime', 'prn'])) assert_frame_equal(df, df_correct) + def test_parse_date_column_with_empty_string(self): + # GH 6428 + data = """case,opdate + 7,10/18/2006 + 7,10/18/2008 + 621, """ + result = read_csv(StringIO(data), parse_dates=['opdate']) + expected_data = [[7, '10/18/2006'], + [7, '10/18/2008'], + [621, ' ']] + expected = DataFrame(expected_data, columns=['case', 'opdate']) + assert_frame_equal(result, expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 2c3e5ca126209..beacc21912edc 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -947,6 +947,18 @@ def test_to_datetime_on_datetime64_series(self): result = to_datetime(s) self.assertEqual(result[0], s[0]) + def test_to_datetime_with_space_in_series(self): + # GH 6428 + s = Series(['10/18/2006', '10/18/2008', ' ']) + tm.assertRaises(ValueError, lambda: to_datetime(s, errors='raise')) + result_coerce = to_datetime(s, errors='coerce') + expected_coerce = Series([datetime(2006, 10, 18), + datetime(2008, 10, 18), + pd.NaT]) + tm.assert_series_equal(result_coerce, expected_coerce) + result_ignore = to_datetime(s, errors='ignore') + tm.assert_series_equal(result_ignore, s) + def test_to_datetime_with_apply(self): # this is only locale tested with US/None locales _skip_if_has_locale() From 43928d49171750c8827f1c6e02c416c0f50fdbeb Mon Sep 17 00:00:00 2001 From: adrian-stepien Date: Wed, 14 Dec 2016 15:53:33 +0100 Subject: [PATCH 170/183] DOC: Improved links between expanding and cum* (GH12651) - [x] closes #12651 - [x] passes `git diff upstream/master | flake8 --diff` Author: adrian-stepien Closes 
#14098 from adrian-stepien/doc/12651 and squashes the following commits: 4427e28 [adrian-stepien] DOC: Improved links between expanding and cum* (#12651) 8466669 [adrian-stepien] DOC: Improved links between expanding and cum* (#12651) 30164f3 [adrian-stepien] DOC: Correct link from b/ffill to fillna --- doc/source/basics.rst | 4 +++- doc/source/computation.rst | 31 +++++++++++++++++++++++++------ pandas/core/generic.py | 34 ++++++++++++++++++++++++---------- 3 files changed, 52 insertions(+), 17 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index e5aa6b577270a..e7db814483905 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -486,7 +486,9 @@ standard deviation 1), very concisely: xs_stand.std(1) Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` -preserve the location of NA values: +preserve the location of ``NaN`` values. This is somewhat different from +:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling`. +For more details please see :ref:`this note `. .. ipython:: python diff --git a/doc/source/computation.rst b/doc/source/computation.rst index d727424750be5..d2b9a0bcab8d9 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -691,6 +691,8 @@ Method Summary :meth:`~Expanding.cov`, Unbiased covariance (binary) :meth:`~Expanding.corr`, Correlation (binary) +.. currentmodule:: pandas + Aside from not having a ``window`` parameter, these functions have the same interfaces as their ``.rolling`` counterparts. Like above, the parameters they all accept are: @@ -700,18 +702,34 @@ all accept are: ``min_periods`` non-null data points have been seen. - ``center``: boolean, whether to set the labels at the center (default is False) +.. _stats.moments.expanding.note: .. note:: The output of the ``.rolling`` and ``.expanding`` methods do not return a ``NaN`` if there are at least ``min_periods`` non-null values in the current - window. This differs from ``cumsum``, ``cumprod``, ``cummax``, and - ``cummin``, which return ``NaN`` in the output wherever a ``NaN`` is - encountered in the input. + window. This differs from :meth:`~DataFrame.cumsum`, + :meth:`~DataFrame.cumprod`, :meth:`~DataFrame.cummax`, + and :meth:`~DataFrame.cummin`, which return ``NaN`` in the output wherever + a ``NaN`` is encountered in the input. + + Please see the example below. In order to match the output of ``cumsum`` + with ``expanding``, use :meth:`~DataFrame.fillna`. + + .. ipython:: python + + sn = pd.Series([1,2,np.nan,3,np.nan,4]) + + sn.expanding().sum() + + sn.cumsum() + + sn.cumsum().fillna(method='ffill') + An expanding window statistic will be more stable (and less responsive) than its rolling window counterpart as the increasing window size decreases the relative impact of an individual data point. As an example, here is the -:meth:`~Expanding.mean` output for the previous time series dataset: +:meth:`~core.window.Expanding.mean` output for the previous time series dataset: .. ipython:: python :suppress: @@ -731,13 +749,14 @@ relative impact of an individual data point. As an example, here is the Exponentially Weighted Windows ------------------------------ +.. currentmodule:: pandas.core.window + A related set of functions are exponentially weighted versions of several of the above statistics. A similar interface to ``.rolling`` and ``.expanding`` is accessed -thru the ``.ewm`` method to receive an :class:`~pandas.core.window.EWM` object. +through the ``.ewm`` method to receive an :class:`~EWM` object. 
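As a rough sketch of that interface (values are arbitrary):

    >>> s = pd.Series([1.0, 2.0, 3.0, 4.0])
    >>> s.ewm(span=3).mean()   # exponentially weighted mean, chained like .rolling() / .expanding()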
A number of expanding EW (exponentially weighted) methods are provided: -.. currentmodule:: pandas.core.window .. csv-table:: :header: "Function", "Description" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 64e3d60e1fe14..3a352e352441b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3354,12 +3354,16 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return self._constructor(new_data).__finalize__(self) def ffill(self, axis=None, inplace=False, limit=None, downcast=None): - """Synonym for NDFrame.fillna(method='ffill')""" + """ + Synonym for :meth:`DataFrame.fillna(method='ffill') ` + """ return self.fillna(method='ffill', axis=axis, inplace=inplace, limit=limit, downcast=downcast) def bfill(self, axis=None, inplace=False, limit=None, downcast=None): - """Synonym for NDFrame.fillna(method='bfill')""" + """ + Synonym for :meth:`DataFrame.fillna(method='bfill') ` + """ return self.fillna(method='bfill', axis=axis, inplace=inplace, limit=limit, downcast=downcast) @@ -5477,16 +5481,18 @@ def compound(self, axis=None, skipna=None, level=None): cls.cummin = _make_cum_function( cls, 'cummin', name, name2, axis_descr, "cumulative minimum", - lambda y, axis: np.minimum.accumulate(y, axis), np.inf, np.nan) + lambda y, axis: np.minimum.accumulate(y, axis), "min", + np.inf, np.nan) cls.cumsum = _make_cum_function( cls, 'cumsum', name, name2, axis_descr, "cumulative sum", - lambda y, axis: y.cumsum(axis), 0., np.nan) + lambda y, axis: y.cumsum(axis), "sum", 0., np.nan) cls.cumprod = _make_cum_function( cls, 'cumprod', name, name2, axis_descr, "cumulative product", - lambda y, axis: y.cumprod(axis), 1., np.nan) + lambda y, axis: y.cumprod(axis), "prod", 1., np.nan) cls.cummax = _make_cum_function( cls, 'cummax', name, name2, axis_descr, "cumulative max", - lambda y, axis: np.maximum.accumulate(y, axis), -np.inf, np.nan) + lambda y, axis: np.maximum.accumulate(y, axis), "max", + -np.inf, np.nan) cls.sum = _make_stat_function( cls, 'sum', name, name2, axis_descr, @@ -5674,7 +5680,15 @@ def _doc_parms(cls): Returns ------- -%(outname)s : %(name1)s\n""" +%(outname)s : %(name1)s\n + + +See also +-------- +pandas.core.window.Expanding.%(accum_func_name)s : Similar functionality + but ignores ``NaN`` values. 
+ +""" def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): @@ -5717,10 +5731,10 @@ def stat_func(self, axis=None, skipna=None, level=None, ddof=1, return set_function_name(stat_func, name, cls) -def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func, - mask_a, mask_b): +def _make_cum_function(cls, name, name1, name2, axis_descr, desc, + accum_func, accum_func_name, mask_a, mask_b): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, accum_func_name=accum_func_name) @Appender("Return {0} over requested axis.".format(desc) + _cnum_doc) def cum_func(self, axis=None, skipna=True, *args, **kwargs): From 30025d82564fc27fbab58fbd791009e5b77a23db Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 14 Dec 2016 15:55:59 +0100 Subject: [PATCH 171/183] DOC: update added docs from #14098 --- doc/source/computation.rst | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index d2b9a0bcab8d9..a19a56f6f1905 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -707,22 +707,25 @@ all accept are: The output of the ``.rolling`` and ``.expanding`` methods do not return a ``NaN`` if there are at least ``min_periods`` non-null values in the current - window. This differs from :meth:`~DataFrame.cumsum`, + window. For example, + + .. ipython:: python + + sn = pd.Series([1, 2, np.nan, 3, np.nan, 4]) + sn + sn.rolling(2).max() + sn.rolling(2, min_periods=1).max() + + In case of expanding functions, this differs from :meth:`~DataFrame.cumsum`, :meth:`~DataFrame.cumprod`, :meth:`~DataFrame.cummax`, and :meth:`~DataFrame.cummin`, which return ``NaN`` in the output wherever - a ``NaN`` is encountered in the input. - - Please see the example below. In order to match the output of ``cumsum`` - with ``expanding``, use :meth:`~DataFrame.fillna`. + a ``NaN`` is encountered in the input. In order to match the output of ``cumsum`` + with ``expanding``, use :meth:`~DataFrame.fillna`: .. ipython:: python - sn = pd.Series([1,2,np.nan,3,np.nan,4]) - sn.expanding().sum() - sn.cumsum() - sn.cumsum().fillna(method='ffill') From 8b89ecee2037ed4110a22fb2fb531dc17fb76cd2 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Wed, 14 Dec 2016 11:08:04 -0500 Subject: [PATCH 172/183] BUG: Convert float freqstrs to ints at finer resolution (#14378) Passing `'0.5min'` as a frequency string should generate 30 second intervals, rather than five minute intervals. By recursively increasing resolution until one is found for which the frequency is an integer, this commit ensures that that's the case for resolutions from days to microseconds. 
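A short sketch of the intended behaviour (dates are arbitrary):

    >>> pd.tseries.frequencies.to_offset('0.5min')              # becomes a 30-second offset
    >>> pd.date_range('2016-01-01', periods=3, freq='0.5min')   # 30-second spacing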
Fixes #8419 --- doc/source/whatsnew/v0.19.1.txt | 2 +- doc/source/whatsnew/v0.20.0.txt | 3 + pandas/src/period.pyx | 28 +++--- pandas/tseries/frequencies.py | 105 +++++++++++++++++++---- pandas/tseries/tests/test_frequencies.py | 51 ++++++++++- pandas/tseries/tests/test_tslib.py | 12 +-- 6 files changed, 159 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index db5bd22393e64..545b4380d9b75 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -58,4 +58,4 @@ Bug Fixes - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`) - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` - is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file + is not scalar and ``values`` is not specified (:issue:`14380`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d0009efd2d994..5cc9d575521f3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -52,6 +52,9 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) + +- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) + - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack of sorting or an incorrect key. See :ref:`here ` diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 5565f25937394..2d92b9f192328 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -45,12 +45,12 @@ cdef bint PY2 = version_info[0] == 2 cdef int64_t NPY_NAT = util.get_nat() -cdef int US_RESO = frequencies.US_RESO -cdef int MS_RESO = frequencies.MS_RESO -cdef int S_RESO = frequencies.S_RESO -cdef int T_RESO = frequencies.T_RESO -cdef int H_RESO = frequencies.H_RESO -cdef int D_RESO = frequencies.D_RESO +cdef int RESO_US = frequencies.RESO_US +cdef int RESO_MS = frequencies.RESO_MS +cdef int RESO_SEC = frequencies.RESO_SEC +cdef int RESO_MIN = frequencies.RESO_MIN +cdef int RESO_HR = frequencies.RESO_HR +cdef int RESO_DAY = frequencies.RESO_DAY cdef extern from "period_helper.h": ctypedef struct date_info: @@ -516,7 +516,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) pandas_datetimestruct dts - int reso = D_RESO, curr_reso + int reso = RESO_DAY, curr_reso if tz is not None: tz = maybe_get_tz(tz) @@ -535,20 +535,20 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef inline int _reso_stamp(pandas_datetimestruct *dts): if dts.us != 0: if dts.us % 1000 == 0: - return MS_RESO - return US_RESO + return RESO_MS + return RESO_US elif dts.sec != 0: - return S_RESO + return RESO_SEC elif dts.min != 0: - return T_RESO + return RESO_MIN elif dts.hour != 0: - return H_RESO - return D_RESO + return RESO_HR + return RESO_DAY cdef _reso_local(ndarray[int64_t] stamps, object tz): cdef: Py_ssize_t n = len(stamps) - int reso = D_RESO, curr_reso + int reso = RESO_DAY, curr_reso ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index ac094c1f545f3..e0c602bf5a037 100644 
--- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -38,32 +38,55 @@ class FreqGroup(object): FR_NS = 12000 -US_RESO = 0 -MS_RESO = 1 -S_RESO = 2 -T_RESO = 3 -H_RESO = 4 -D_RESO = 5 +RESO_NS = 0 +RESO_US = 1 +RESO_MS = 2 +RESO_SEC = 3 +RESO_MIN = 4 +RESO_HR = 5 +RESO_DAY = 6 class Resolution(object): - # defined in period.pyx - # note that these are different from freq codes - RESO_US = US_RESO - RESO_MS = MS_RESO - RESO_SEC = S_RESO - RESO_MIN = T_RESO - RESO_HR = H_RESO - RESO_DAY = D_RESO + RESO_US = RESO_US + RESO_MS = RESO_MS + RESO_SEC = RESO_SEC + RESO_MIN = RESO_MIN + RESO_HR = RESO_HR + RESO_DAY = RESO_DAY _reso_str_map = { + RESO_NS: 'nanosecond', RESO_US: 'microsecond', RESO_MS: 'millisecond', RESO_SEC: 'second', RESO_MIN: 'minute', RESO_HR: 'hour', - RESO_DAY: 'day'} + RESO_DAY: 'day' + } + + # factor to multiply a value by to convert it to the next finer grained + # resolution + _reso_mult_map = { + RESO_NS: None, + RESO_US: 1000, + RESO_MS: 1000, + RESO_SEC: 1000, + RESO_MIN: 60, + RESO_HR: 60, + RESO_DAY: 24 + } + + _reso_str_bump_map = { + 'D': 'H', + 'H': 'T', + 'T': 'S', + 'S': 'L', + 'L': 'U', + 'U': 'N', + 'N': None + } _str_reso_map = dict([(v, k) for k, v in compat.iteritems(_reso_str_map)]) @@ -160,6 +183,47 @@ def get_reso_from_freq(cls, freq): """ return cls.get_reso(cls.get_str_from_freq(freq)) + @classmethod + def get_stride_from_decimal(cls, value, freq): + """ + Convert freq with decimal stride into a higher freq with integer stride + + Parameters + ---------- + value : integer or float + freq : string + Frequency string + + Raises + ------ + ValueError + If the float cannot be converted to an integer at any resolution. + + Example + ------- + >>> Resolution.get_stride_from_decimal(1.5, 'T') + (90, 'S') + + >>> Resolution.get_stride_from_decimal(1.04, 'H') + (3744, 'S') + + >>> Resolution.get_stride_from_decimal(1, 'D') + (1, 'D') + """ + + if np.isclose(value % 1, 0): + return int(value), freq + else: + start_reso = cls.get_reso_from_freq(freq) + if start_reso == 0: + raise ValueError( + "Could not convert to integer offset at any resolution" + ) + + next_value = cls._reso_mult_map[start_reso] * value + next_name = cls._reso_str_bump_map[freq] + return cls.get_stride_from_decimal(next_value, next_name) + def get_to_timestamp_base(base): """ @@ -472,12 +536,17 @@ def to_offset(freq): splitted[2::4]): if sep != '' and not sep.isspace(): raise ValueError('separator must be spaces') - offset = get_offset(name) + prefix = _lite_rule_alias.get(name) or name if stride_sign is None: stride_sign = -1 if stride.startswith('-') else 1 if not stride: stride = 1 + if prefix in Resolution._reso_str_bump_map.keys(): + stride, name = Resolution.get_stride_from_decimal( + float(stride), prefix + ) stride = int(stride) + offset = get_offset(name) offset = offset * int(np.fabs(stride) * stride_sign) if delta is None: delta = offset @@ -493,7 +562,9 @@ def to_offset(freq): # hack to handle WOM-1MON -opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)') +opattern = re.compile( + r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)' +) def _base_and_stride(freqstr): diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 5ba98f15aed8d..dfb7b26371d7a 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -39,6 +39,21 @@ def test_to_offset_multiple(self): expected = offsets.Hour(3) assert (result == expected) + freqstr = '2h 20.5min' + result 
= frequencies.to_offset(freqstr) + expected = offsets.Second(8430) + assert (result == expected) + + freqstr = '1.5min' + result = frequencies.to_offset(freqstr) + expected = offsets.Second(90) + assert (result == expected) + + freqstr = '0.5S' + result = frequencies.to_offset(freqstr) + expected = offsets.Milli(500) + assert (result == expected) + freqstr = '15l500u' result = frequencies.to_offset(freqstr) expected = offsets.Micro(15500) @@ -49,6 +64,16 @@ def test_to_offset_multiple(self): expected = offsets.Milli(10075) assert (result == expected) + freqstr = '1s0.25ms' + result = frequencies.to_offset(freqstr) + expected = offsets.Micro(1000250) + assert (result == expected) + + freqstr = '1s0.25L' + result = frequencies.to_offset(freqstr) + expected = offsets.Micro(1000250) + assert (result == expected) + freqstr = '2800N' result = frequencies.to_offset(freqstr) expected = offsets.Nano(2800) @@ -107,10 +132,8 @@ def test_to_offset_invalid(self): frequencies.to_offset('-2-3U') with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2D:3H'): frequencies.to_offset('-2D:3H') - - # ToDo: Must be fixed in #8419 - with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: .5S'): - frequencies.to_offset('.5S') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 1.5.0S'): + frequencies.to_offset('1.5.0S') # split offsets with spaces are valid assert frequencies.to_offset('2D 3H') == offsets.Hour(51) @@ -379,6 +402,26 @@ def test_freq_to_reso(self): result = Reso.get_freq(Reso.get_str(Reso.get_reso_from_freq(freq))) self.assertEqual(freq, result) + def test_resolution_bumping(self): + # GH 14378 + Reso = frequencies.Resolution + + self.assertEqual(Reso.get_stride_from_decimal(1.5, 'T'), (90, 'S')) + self.assertEqual(Reso.get_stride_from_decimal(62.4, 'T'), (3744, 'S')) + self.assertEqual(Reso.get_stride_from_decimal(1.04, 'H'), (3744, 'S')) + self.assertEqual(Reso.get_stride_from_decimal(1, 'D'), (1, 'D')) + self.assertEqual(Reso.get_stride_from_decimal(0.342931, 'H'), + (1234551600, 'U')) + self.assertEqual(Reso.get_stride_from_decimal(1.2345, 'D'), + (106660800, 'L')) + + with self.assertRaises(ValueError): + Reso.get_stride_from_decimal(0.5, 'N') + + # too much precision in the input can prevent + with self.assertRaises(ValueError): + Reso.get_stride_from_decimal(0.3429324798798269273987982, 'H') + def test_get_freq_code(self): # freqstr self.assertEqual(frequencies.get_freq_code('A'), diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index b45f867be65dd..58ec1561b2535 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -14,7 +14,7 @@ from pandas.tseries.index import date_range, DatetimeIndex from pandas.tseries.frequencies import ( get_freq, - US_RESO, MS_RESO, S_RESO, H_RESO, D_RESO, T_RESO + RESO_US, RESO_MS, RESO_SEC, RESO_HR, RESO_DAY, RESO_MIN ) import pandas.tseries.tools as tools import pandas.tseries.offsets as offsets @@ -1528,11 +1528,11 @@ def test_resolution(self): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], - [D_RESO, D_RESO, - D_RESO, D_RESO, - H_RESO, T_RESO, - S_RESO, MS_RESO, - US_RESO]): + [RESO_DAY, RESO_DAY, + RESO_DAY, RESO_DAY, + RESO_HR, RESO_MIN, + RESO_SEC, RESO_MS, + RESO_US]): for tz in [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Eastern']: idx = date_range(start='2013-04-01', periods=30, freq=freq, From 84cad615564175119635abaea8b83b36a6550d7c Mon Sep 17 00:00:00 2001 From: "Christopher C. 
Aycock" Date: Wed, 14 Dec 2016 11:10:42 -0500 Subject: [PATCH 173/183] ENH: merge_asof() has left_index/right_index and left_by/right_by (#14253) (#14531) --- doc/source/whatsnew/v0.19.2.txt | 9 ++ pandas/tools/merge.py | 152 +++++++++++++++++++++----- pandas/tools/tests/test_merge_asof.py | 90 +++++++++++++++ 3 files changed, 224 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 231297df3fb8f..dabc6036fc9ba 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -23,6 +23,15 @@ Performance Improvements - Improved performance of ``.replace()`` (:issue:`12745`) +.. _whatsnew_0192.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`) + + + .. _whatsnew_0192.bug_fixes: Bug Fixes diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 68953c90676dd..198991531e0a7 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -259,7 +259,8 @@ def _merger(x, y): def merge_asof(left, right, on=None, left_on=None, right_on=None, - by=None, + left_index=False, right_index=False, + by=None, left_by=None, right_by=None, suffixes=('_x', '_y'), tolerance=None, allow_exact_matches=True): @@ -288,9 +289,29 @@ def merge_asof(left, right, on=None, Field name to join on in left DataFrame. right_on : label Field name to join on in right DataFrame. + left_index : boolean + Use the index of the left DataFrame as the join key. + + .. versionadded:: 0.19.2 + + right_index : boolean + Use the index of the right DataFrame as the join key. + + .. versionadded:: 0.19.2 + by : column name Group both the left and right DataFrames by the group column; perform the merge operation on these pieces and recombine. + left_by : column name + Field name to group by in the left DataFrame. + + .. versionadded:: 0.19.2 + + right_by : column name + Field name to group by in the right DataFrame. + + .. versionadded:: 0.19.2 + suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right side, respectively @@ -348,6 +369,28 @@ def merge_asof(left, right, on=None, 3 5 b 3.0 6 10 c 7.0 + We can use indexed DataFrames as well. + + >>> left + left_val + 1 a + 5 b + 10 c + + >>> right + right_val + 1 1 + 2 2 + 3 3 + 6 6 + 7 7 + + >>> pd.merge_asof(left, right, left_index=True, right_index=True) + left_val right_val + 1 a 1 + 5 b 3 + 10 c 7 + Here is a real-world times-series example >>> quotes @@ -418,7 +461,9 @@ def merge_asof(left, right, on=None, """ op = _AsOfMerge(left, right, on=on, left_on=left_on, right_on=right_on, - by=by, suffixes=suffixes, + left_index=left_index, right_index=right_index, + by=by, left_by=left_by, right_by=right_by, + suffixes=suffixes, how='asof', tolerance=tolerance, allow_exact_matches=allow_exact_matches) return op.get_result() @@ -650,7 +695,7 @@ def _get_join_info(self): left_ax = self.left._data.axes[self.axis] right_ax = self.right._data.axes[self.axis] - if self.left_index and self.right_index: + if self.left_index and self.right_index and self.how != 'asof': join_index, left_indexer, right_indexer = \ left_ax.join(right_ax, how=self.how, return_indexers=True) elif self.right_index and self.how == 'left': @@ -731,6 +776,16 @@ def _get_merge_keys(self): is_rkey = lambda x: isinstance( x, (np.ndarray, ABCSeries)) and len(x) == len(right) + # Note that pd.merge_asof() has separate 'on' and 'by' parameters. 
A + # user could, for example, request 'left_index' and 'left_by'. In a + # regular pd.merge(), users cannot specify both 'left_index' and + # 'left_on'. (Instead, users have a MultiIndex). That means the + # self.left_on in this function is always empty in a pd.merge(), but + # a pd.merge_asof(left_index=True, left_by=...) will result in a + # self.left_on array with a None in the middle of it. This requires + # a work-around as designated in the code below. + # See _validate_specification() for where this happens. + # ugh, spaghetti re #733 if _any(self.left_on) and _any(self.right_on): for lk, rk in zip(self.left_on, self.right_on): @@ -740,12 +795,21 @@ def _get_merge_keys(self): right_keys.append(rk) join_names.append(None) # what to do? else: - right_keys.append(right[rk]._values) - join_names.append(rk) + if rk is not None: + right_keys.append(right[rk]._values) + join_names.append(rk) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + join_names.append(right.index.name) else: if not is_rkey(rk): - right_keys.append(right[rk]._values) - if lk == rk: + if rk is not None: + right_keys.append(right[rk]._values) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + if lk is not None and lk == rk: # avoid key upcast in corner case (length-0) if len(left) > 0: right_drop.append(rk) @@ -753,8 +817,13 @@ def _get_merge_keys(self): left_drop.append(lk) else: right_keys.append(rk) - left_keys.append(left[lk]._values) - join_names.append(lk) + if lk is not None: + left_keys.append(left[lk]._values) + join_names.append(lk) + else: + # work-around for merge_asof(left_index=True) + left_keys.append(left.index) + join_names.append(left.index.name) elif _any(self.left_on): for k in self.left_on: if is_lkey(k): @@ -879,13 +948,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', class _OrderedMerge(_MergeOperation): _merge_type = 'ordered_merge' - def __init__(self, left, right, on=None, left_on=None, - right_on=None, axis=1, + def __init__(self, left, right, on=None, left_on=None, right_on=None, + left_index=False, right_index=False, axis=1, suffixes=('_x', '_y'), copy=True, fill_method=None, how='outer'): self.fill_method = fill_method _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, + left_index=left_index, + right_index=right_index, right_on=right_on, axis=axis, how=how, suffixes=suffixes, sort=True # factorize sorts @@ -958,19 +1029,23 @@ def _get_cython_type(dtype): class _AsOfMerge(_OrderedMerge): _merge_type = 'asof_merge' - def __init__(self, left, right, on=None, by=None, left_on=None, - right_on=None, axis=1, - suffixes=('_x', '_y'), copy=True, + def __init__(self, left, right, on=None, left_on=None, right_on=None, + left_index=False, right_index=False, + by=None, left_by=None, right_by=None, + axis=1, suffixes=('_x', '_y'), copy=True, fill_method=None, how='asof', tolerance=None, allow_exact_matches=True): self.by = by + self.left_by = left_by + self.right_by = right_by self.tolerance = tolerance self.allow_exact_matches = allow_exact_matches _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, - right_on=right_on, axis=axis, + right_on=right_on, left_index=left_index, + right_index=right_index, axis=axis, how=how, suffixes=suffixes, fill_method=fill_method) @@ -978,23 +1053,44 @@ def _validate_specification(self): super(_AsOfMerge, self)._validate_specification() # we only allow on to be a single item for on - if len(self.left_on) != 1: + if 
len(self.left_on) != 1 and not self.left_index: raise MergeError("can only asof on a key for left") - if len(self.right_on) != 1: + if len(self.right_on) != 1 and not self.right_index: raise MergeError("can only asof on a key for right") + if self.left_index and isinstance(self.left.index, MultiIndex): + raise MergeError("left can only have one index") + + if self.right_index and isinstance(self.right.index, MultiIndex): + raise MergeError("right can only have one index") + + # set 'by' columns + if self.by is not None: + if self.left_by is not None or self.right_by is not None: + raise MergeError('Can only pass by OR left_by ' + 'and right_by') + self.left_by = self.right_by = self.by + if self.left_by is None and self.right_by is not None: + raise MergeError('missing left_by') + if self.left_by is not None and self.right_by is None: + raise MergeError('missing right_by') + # add by to our key-list so we can have it in the # output as a key - if self.by is not None: - if not is_list_like(self.by): - self.by = [self.by] + if self.left_by is not None: + if not is_list_like(self.left_by): + self.left_by = [self.left_by] + if not is_list_like(self.right_by): + self.right_by = [self.right_by] - if len(self.by) != 1: + if len(self.left_by) != 1: + raise MergeError("can only asof by a single key") + if len(self.right_by) != 1: raise MergeError("can only asof by a single key") - self.left_on = self.by + list(self.left_on) - self.right_on = self.by + list(self.right_on) + self.left_on = self.left_by + list(self.left_on) + self.right_on = self.right_by + list(self.right_on) @property def _asof_key(self): @@ -1017,7 +1113,7 @@ def _get_merge_keys(self): # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: - lt = left_join_keys[self.left_on.index(self._asof_key)] + lt = left_join_keys[-1] msg = "incompatible tolerance, must be compat " \ "with type {0}".format(type(lt)) @@ -1047,8 +1143,10 @@ def _get_join_indexers(self): """ return the join indexers """ # values to compare - left_values = self.left_join_keys[-1] - right_values = self.right_join_keys[-1] + left_values = (self.left.index.values if self.left_index else + self.left_join_keys[-1]) + right_values = (self.right.index.values if self.right_index else + self.right_join_keys[-1]) tolerance = self.tolerance # we required sortedness in the join keys @@ -1066,7 +1164,7 @@ def _get_join_indexers(self): tolerance = tolerance.value # a "by" parameter requires special handling - if self.by is not None: + if self.left_by is not None: left_by_values = self.left_join_keys[0] right_by_values = self.right_join_keys[0] diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index 5c8f424bde7a5..d33ba30d7f032 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -118,6 +118,96 @@ def test_basic_categorical(self): by='ticker') assert_frame_equal(result, expected) + def test_basic_left_index(self): + + # GH14253 + expected = self.asof + trades = self.trades.set_index('time') + quotes = self.quotes + + result = merge_asof(trades, quotes, + left_index=True, + right_on='time', + by='ticker') + # left-only index uses right's index, oddly + expected.index = result.index + # time column appears after left's columns + expected = expected[result.columns] + assert_frame_equal(result, expected) + + def test_basic_right_index(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes.set_index('time') + + result = merge_asof(trades, 
quotes, + left_on='time', + right_index=True, + by='ticker') + assert_frame_equal(result, expected) + + def test_basic_left_index_right_index(self): + + expected = self.asof.set_index('time') + trades = self.trades.set_index('time') + quotes = self.quotes.set_index('time') + + result = merge_asof(trades, quotes, + left_index=True, + right_index=True, + by='ticker') + assert_frame_equal(result, expected) + + def test_multi_index(self): + + # MultiIndex is prohibited + trades = self.trades.set_index(['time', 'price']) + quotes = self.quotes.set_index('time') + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_index=True, + right_index=True) + + trades = self.trades.set_index('time') + quotes = self.quotes.set_index(['time', 'bid']) + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_index=True, + right_index=True) + + def test_on_and_index(self): + + # 'on' parameter and index together is prohibited + trades = self.trades.set_index('time') + quotes = self.quotes.set_index('time') + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_on='price', + left_index=True, + right_index=True) + + trades = self.trades.set_index('time') + quotes = self.quotes.set_index('time') + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + right_on='bid', + left_index=True, + right_index=True) + + def test_basic_left_by_right_by(self): + + # GH14253 + expected = self.asof + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, + on='time', + left_by='ticker', + right_by='ticker') + assert_frame_equal(result, expected) + def test_missing_right_by(self): expected = self.asof From abdfa3ef6f3d2c228f5029d42e8802061e081bf1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 14 Dec 2016 17:19:00 +0100 Subject: [PATCH 174/183] DOC: already include 0.20 release notes for dev docs --- doc/source/whatsnew.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 616e1f5c8efc7..d6fb1c6a8f9cc 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.20.0.txt + .. include:: whatsnew/v0.19.2.txt .. include:: whatsnew/v0.19.1.txt From a8cabb8ebb5875f2aa6e69c355e0ba2af06f7438 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 14 Dec 2016 15:39:39 -0500 Subject: [PATCH 175/183] ENH: Allow the groupby by param to handle columns and index levels (GH5677) (#14432) --- doc/source/groupby.rst | 83 ++++++++++++++--- doc/source/whatsnew/v0.20.0.txt | 14 +++ pandas/core/generic.py | 2 +- pandas/core/groupby.py | 16 +++- pandas/tests/test_groupby.py | 152 ++++++++++++++++++++++++++++++++ 5 files changed, 252 insertions(+), 15 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index f3fcd6901a440..45af02cb60b25 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -94,11 +94,21 @@ The mapping can be specified many different ways: - For DataFrame objects, a string indicating a column to be used to group. Of course ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``, but it makes life simpler + - For DataFrame objects, a string indicating an index level to be used to group. - A list of any of the above things Collectively we refer to the grouping objects as the **keys**. For example, consider the following DataFrame: +.. note:: + + .. 
versionadded:: 0.20 + + A string passed to ``groupby`` may refer to either a column or an index level. + If a string matches both a column name and an index level name then a warning is + issued and the column takes precedence. This will result in an ambiguity error + in a future version. + .. ipython:: python df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', @@ -237,17 +247,6 @@ the length of the ``groups`` dict, so it is largely just a convenience: gb.aggregate gb.count gb.cumprod gb.dtype gb.first gb.groups gb.hist gb.max gb.min gb.nth gb.prod gb.resample gb.sum gb.var gb.apply gb.cummax gb.cumsum gb.fillna gb.gender gb.head gb.indices gb.mean gb.name gb.ohlc gb.quantile gb.size gb.tail gb.weight - -.. ipython:: python - :suppress: - - df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B' : ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C' : np.random.randn(8), - 'D' : np.random.randn(8)}) - .. _groupby.multiindex: GroupBy with MultiIndex @@ -289,7 +288,9 @@ chosen level: s.sum(level='second') -Also as of v0.6, grouping with multiple levels is supported. +.. versionadded:: 0.6 + +Grouping with multiple levels is supported. .. ipython:: python :suppress: @@ -306,8 +307,56 @@ Also as of v0.6, grouping with multiple levels is supported. s s.groupby(level=['first', 'second']).sum() +.. versionadded:: 0.20 + +Index level names may be supplied as keys. + +.. ipython:: python + + s.groupby(['first', 'second']).sum() + More on the ``sum`` function and aggregation later. +Grouping DataFrame with Index Levels and Columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A DataFrame may be grouped by a combination of columns and index levels by +specifying the column names as strings and the index levels as ``pd.Grouper`` +objects. + +.. ipython:: python + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + + index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + + df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3], + 'B': np.arange(8)}, + index=index) + + df + +The following example groups ``df`` by the ``second`` index level and +the ``A`` column. + +.. ipython:: python + + df.groupby([pd.Grouper(level=1), 'A']).sum() + +Index levels may also be specified by name. + +.. ipython:: python + + df.groupby([pd.Grouper(level='second'), 'A']).sum() + +.. versionadded:: 0.20 + +Index level names may be specified as keys directly to ``groupby``. + +.. ipython:: python + + df.groupby(['second', 'A']).sum() + DataFrame column selection in GroupBy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -315,6 +364,16 @@ Once you have created the GroupBy object from a DataFrame, for example, you might want to do something different for each of the columns. Thus, using ``[]`` similar to getting a column from a DataFrame, you can do: +.. ipython:: python + :suppress: + + df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : np.random.randn(8), + 'D' : np.random.randn(8)}) + .. 
ipython:: python grouped = df.groupby(['A']) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 5cc9d575521f3..6ee97c555f5ed 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -51,6 +51,20 @@ Other enhancements - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) +- Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`) + +.. ipython:: python + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + + index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + + df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3], + 'B': np.arange(8)}, + index=index) + + df.groupby(['second', 'A']).sum() - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3a352e352441b..48d799811aa94 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4007,7 +4007,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Parameters ---------- by : mapping function / list of functions, dict, Series, or tuple / - list of column names. + list of column names or index level names. Called on each element of the object index to determine the groups. If a dict or Series is passed, the Series or dict VALUES will be used to determine the groups diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 66c9e38766989..b249cded39133 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2459,8 +2459,20 @@ def is_in_obj(gpr): exclusions.append(name) elif is_in_axis(gpr): # df.groupby('name') - in_axis, name, gpr = True, gpr, obj[gpr] - exclusions.append(name) + if gpr in obj: + if gpr in obj.index.names: + warnings.warn( + ("'%s' is both a column name and an index level.\n" + "Defaulting to column but " + "this will raise an ambiguity error in a " + "future version") % gpr, + FutureWarning, stacklevel=2) + in_axis, name, gpr = True, gpr, obj[gpr] + exclusions.append(name) + elif gpr in obj.index.names: + in_axis, name, level, gpr = False, None, gpr, None + else: + raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions exclusions.append(gpr.key) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a2e1c5e9ff2e8..7b98a45395752 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -521,6 +521,158 @@ def test_grouper_column_and_index(self): expected = df_single.reset_index().groupby(['inner', 'B']).mean() assert_frame_equal(result, expected) + def test_grouper_index_level_as_string(self): + # GH 5677, allow strings passed as the `by` parameter to reference + # columns or index levels + + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), + ('b', 1), ('b', 2), ('b', 3)]) + idx.names = ['outer', 'inner'] + df_multi = pd.DataFrame({"A": np.arange(6), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one']}, + index=idx) + + df_single = df_multi.reset_index('outer') + + # Column and Index on MultiIndex + result = df_multi.groupby(['B', 'inner']).mean() + expected = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() + assert_frame_equal(result, expected) + + # Index and 
Column on MultiIndex + result = df_multi.groupby(['inner', 'B']).mean() + expected = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() + assert_frame_equal(result, expected) + + # Column and Index on single Index + result = df_single.groupby(['B', 'inner']).mean() + expected = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() + assert_frame_equal(result, expected) + + # Index and Column on single Index + result = df_single.groupby(['inner', 'B']).mean() + expected = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() + assert_frame_equal(result, expected) + + # Single element list of Index on MultiIndex + result = df_multi.groupby(['inner']).mean() + expected = df_multi.groupby(pd.Grouper(level='inner')).mean() + assert_frame_equal(result, expected) + + # Single element list of Index on single Index + result = df_single.groupby(['inner']).mean() + expected = df_single.groupby(pd.Grouper(level='inner')).mean() + assert_frame_equal(result, expected) + + # Index on MultiIndex + result = df_multi.groupby('inner').mean() + expected = df_multi.groupby(pd.Grouper(level='inner')).mean() + assert_frame_equal(result, expected) + + # Index on single Index + result = df_single.groupby('inner').mean() + expected = df_single.groupby(pd.Grouper(level='inner')).mean() + assert_frame_equal(result, expected) + + def test_grouper_column_index_level_precedence(self): + # GH 5677, when a string passed as the `by` parameter + # matches a column and an index level the column takes + # precedence + + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), + ('b', 1), ('b', 2), ('b', 3)]) + idx.names = ['outer', 'inner'] + df_multi_both = pd.DataFrame({"A": np.arange(6), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one'], + 'inner': [1, 1, 1, 1, 1, 1]}, + index=idx) + + df_single_both = df_multi_both.reset_index('outer') + + # Group MultiIndex by single key + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_multi_both.groupby('inner').mean() + + expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean() + assert_frame_equal(result, expected) + not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean() + self.assertFalse(result.index.equals(not_expected.index)) + + # Group single Index by single key + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_single_both.groupby('inner').mean() + + expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean() + assert_frame_equal(result, expected) + not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean() + self.assertFalse(result.index.equals(not_expected.index)) + + # Group MultiIndex by single key list + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_multi_both.groupby(['inner']).mean() + + expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean() + assert_frame_equal(result, expected) + not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean() + self.assertFalse(result.index.equals(not_expected.index)) + + # Group single Index by single key list + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_single_both.groupby(['inner']).mean() + + expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean() + assert_frame_equal(result, expected) + not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean() + self.assertFalse(result.index.equals(not_expected.index)) + + # Group MultiIndex by two keys (1) + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_multi_both.groupby(['B', 'inner']).mean() + + expected = df_multi_both.groupby(['B', + pd.Grouper(key='inner')]).mean() + assert_frame_equal(result, expected) + not_expected = df_multi_both.groupby(['B', + pd.Grouper(level='inner') + ]).mean() + self.assertFalse(result.index.equals(not_expected.index)) + + # Group MultiIndex by two keys (2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_multi_both.groupby(['inner', 'B']).mean() + + expected = df_multi_both.groupby([pd.Grouper(key='inner'), + 'B']).mean() + assert_frame_equal(result, expected) + not_expected = df_multi_both.groupby([pd.Grouper(level='inner'), + 'B']).mean() + self.assertFalse(result.index.equals(not_expected.index)) + + # Group single Index by two keys (1) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_single_both.groupby(['B', 'inner']).mean() + + expected = df_single_both.groupby(['B', + pd.Grouper(key='inner')]).mean() + assert_frame_equal(result, expected) + not_expected = df_single_both.groupby(['B', + pd.Grouper(level='inner') + ]).mean() + self.assertFalse(result.index.equals(not_expected.index)) + + # Group single Index by two keys (2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_single_both.groupby(['inner', 'B']).mean() + + expected = df_single_both.groupby([pd.Grouper(key='inner'), + 'B']).mean() + assert_frame_equal(result, expected) + not_expected = df_single_both.groupby([pd.Grouper(level='inner'), + 'B']).mean() + self.assertFalse(result.index.equals(not_expected.index)) + def test_grouper_getting_correct_binner(self): # GH 10063 From 5f889a2106f6584583458e01dbd0f3b9b696fab2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 15 Dec 2016 01:02:51 -0800 Subject: [PATCH 176/183] TST: Same return values in drop_duplicates for Series and DataFrames(#14192) (#14754) --- pandas/tests/test_base.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index da8cf120b8ed4..a5cd0bbc28369 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -949,6 +949,21 @@ def test_duplicated_drop_duplicates_index(self): s.drop_duplicates(inplace=True) tm.assert_series_equal(s, original) + def test_drop_duplicates_series_vs_dataframe(self): + # GH 14192 + df = pd.DataFrame({'a': [1, 1, 1, 'one', 'one'], + 'b': [2, 2, np.nan, np.nan, np.nan], + 'c': [3, 3, np.nan, np.nan, 'three'], + 'd': [1, 2, 3, 4, 4], + 'e': [datetime(2015, 1, 1), datetime(2015, 1, 1), + datetime(2015, 2, 1), pd.NaT, pd.NaT] + }) + for column in df.columns: + for keep in ['first', 'last', False]: + dropped_frame = df[[column]].drop_duplicates(keep=keep) + dropped_series = df[column].drop_duplicates(keep=keep) + tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) + def test_fillna(self): # # GH 11343 # though Index.fillna and Series.fillna has separate impl, From 358bc6f8967d56e0cf914f0890b6d11864f2cc47 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 15 Dec 2016 06:20:59 -0500 Subject: [PATCH 177/183] MAINT: Style check *.c and *.h files `cpplint` was introduced #14740, and this commit extends to check other `*.c` and `*.h` files. 
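[Illustrative sketch, not part of the patch] The groupby change above can be summarized with a small example (the frame, values and names below are made up for illustration): a string passed to ``groupby`` that names only an index level now resolves to that level, while a string that names both a column and an index level resolves to the column and emits a ``FutureWarning``:

    import pandas as pd

    idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)],
                                    names=['outer', 'inner'])
    df = pd.DataFrame({'A': [10, 20, 30, 40],
                       'B': ['x', 'y', 'x', 'y']}, index=idx)

    # 'inner' is only an index level, so the string resolves to that level
    df.groupby(['B', 'inner']).mean()

    # Make 'inner' both a column and an index level: the column takes
    # precedence and a FutureWarning flags the ambiguity
    df['inner'] = 0
    df.groupby('inner').mean()
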
Currently, they all reside in `pandas/src`, and this commit expands the lint to check all of the following: 1) `datetime` (dir) 2) `ujson` (dir) 3) `period_helper.c` 4) `All header files` The parser directory was handled in #14740, and the others have been deliberately omitted per the discussion here. Author: gfyoung Closes #14814 from gfyoung/c-style-continue and squashes the following commits: 27d4d46 [gfyoung] MAINT: Style check *.c and *.h files --- ci/lint.sh | 17 +- pandas/src/datetime/np_datetime.c | 437 +-- pandas/src/datetime/np_datetime.h | 62 +- pandas/src/datetime/np_datetime_strings.c | 470 +-- pandas/src/datetime/np_datetime_strings.h | 27 +- pandas/src/datetime_helper.h | 19 +- pandas/src/helper.h | 17 +- pandas/src/numpy_helper.h | 194 +- pandas/src/parse_helper.h | 278 +- pandas/src/period_helper.c | 958 ++--- pandas/src/period_helper.h | 183 +- pandas/src/skiplist.h | 436 ++- pandas/src/ujson/lib/ultrajson.h | 54 +- pandas/src/ujson/lib/ultrajsondec.c | 1756 +++++---- pandas/src/ujson/lib/ultrajsonenc.c | 1743 +++++---- pandas/src/ujson/python/JSONtoObj.c | 1036 +++-- pandas/src/ujson/python/objToJSON.c | 4191 +++++++++------------ pandas/src/ujson/python/py_defines.h | 5 + pandas/src/ujson/python/ujson.c | 92 +- pandas/src/ujson/python/version.h | 5 + 20 files changed, 5948 insertions(+), 6032 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 7ab97bfc6d328..d7df6215450b4 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -38,13 +38,22 @@ if [ "$LINT" ]; then # readability/casting: Warnings about C casting instead of C++ casting # runtime/int: Warnings about using C number types instead of C++ ones # build/include_subdir: Warnings about prefacing included header files with directory + + # We don't lint all C files because we don't want to lint any that are built + # from Cython files nor do we want to lint C files that we didn't modify for + # this particular codebase (e.g. src/headers, src/klib, src/msgpack). However, + # we can lint all header files since they aren't "generated" like C files are. pip install cpplint echo "Linting *.c and *.h" - cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/parser - if [ $? -ne "0" ]; then - RET=1 - fi + for path in '*.h' 'period_helper.c' 'datetime' 'parser' 'ujson' + do + echo "linting -> pandas/src/$path" + cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/$path + if [ $? -ne "0" ]; then + RET=1 + fi + done echo "Linting *.c and *.h DONE" echo "Check for invalid testing" diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c index d4b9de45618f3..8458418988863 100644 --- a/pandas/src/datetime/np_datetime.c +++ b/pandas/src/datetime/np_datetime.c @@ -1,65 +1,65 @@ /* - * This is derived from Numpy 1.7 - * - * See NP_LICENSE.txt - */ + +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. + +Copyright (c) 2005-2011, NumPy Developers +All rights reserved. + +This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt + +*/ #define NO_IMPORT #include #include -/* #define __MSVCRT_VERSION__ 0x0700 /\* whatever above 0x0601 *\/ */ -/* #include */ -/* #define time_t __time64_t */ -/* #define localtime _localtime64 */ -/* #define time _time64 */ - #include #include #include "np_datetime.h" #if PY_MAJOR_VERSION >= 3 - #define PyIntObject PyLongObject - #define PyInt_Type PyLong_Type - #define PyInt_Check(op) PyLong_Check(op) - #define PyInt_CheckExact(op) PyLong_CheckExact(op) - #define PyInt_FromString PyLong_FromString - #define PyInt_FromUnicode PyLong_FromUnicode - #define PyInt_FromLong PyLong_FromLong - #define PyInt_FromSize_t PyLong_FromSize_t - #define PyInt_FromSsize_t PyLong_FromSsize_t - #define PyInt_AsLong PyLong_AsLong - #define PyInt_AS_LONG PyLong_AS_LONG - #define PyInt_AsSsize_t PyLong_AsSsize_t - #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask - #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask +#define PyIntObject PyLongObject +#define PyInt_Type PyLong_Type +#define PyInt_Check(op) PyLong_Check(op) +#define PyInt_CheckExact(op) PyLong_CheckExact(op) +#define PyInt_FromString PyLong_FromString +#define PyInt_FromUnicode PyLong_FromUnicode +#define PyInt_FromLong PyLong_FromLong +#define PyInt_FromSize_t PyLong_FromSize_t +#define PyInt_FromSsize_t PyLong_FromSsize_t +#define PyInt_AsLong PyLong_AsLong +#define PyInt_AS_LONG PyLong_AS_LONG +#define PyInt_AsSsize_t PyLong_AsSsize_t +#define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask +#define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask #endif const int days_per_month_table[2][12] = { - { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, - { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } -}; + {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, + {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; /* * Returns 1 if the given year is a leap year, 0 otherwise. */ -int is_leapyear(npy_int64 year) -{ +int is_leapyear(npy_int64 year) { return (year & 0x3) == 0 && /* year % 4 == 0 */ - ((year % 100) != 0 || - (year % 400) == 0); + ((year % 100) != 0 || (year % 400) == 0); } /* * Sakamoto's method, from wikipedia */ -int dayofweek(int y, int m, int d) -{ +int dayofweek(int y, int m, int d) { int day; static const int t[] = {0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4}; y -= m < 3; - day = (y + y/4 - y/100 + y/400 + t[m-1] + d) % 7; + day = (y + y / 4 - y / 100 + y / 400 + t[m - 1] + d) % 7; // convert to python day return (day + 6) % 7; } @@ -68,9 +68,7 @@ int dayofweek(int y, int m, int d) * Adjusts a datetimestruct based on a minutes offset. 
Assumes * the current values are valid.g */ -void -add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes) -{ +void add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes) { int isleap; /* MINUTES */ @@ -102,12 +100,11 @@ add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes) dts->month = 12; } isleap = is_leapyear(dts->year); - dts->day += days_per_month_table[isleap][dts->month-1]; - } - else if (dts->day > 28) { + dts->day += days_per_month_table[isleap][dts->month - 1]; + } else if (dts->day > 28) { isleap = is_leapyear(dts->year); - if (dts->day > days_per_month_table[isleap][dts->month-1]) { - dts->day -= days_per_month_table[isleap][dts->month-1]; + if (dts->day > days_per_month_table[isleap][dts->month - 1]) { + dts->day -= days_per_month_table[isleap][dts->month - 1]; dts->month++; if (dts->month > 12) { dts->year++; @@ -120,9 +117,7 @@ add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes) /* * Calculates the days offset from the 1970 epoch. */ -npy_int64 -get_datetimestruct_days(const pandas_datetimestruct *dts) -{ +npy_int64 get_datetimestruct_days(const pandas_datetimestruct *dts) { int i, month; npy_int64 year, days = 0; const int *month_lengths; @@ -147,8 +142,7 @@ get_datetimestruct_days(const pandas_datetimestruct *dts) year += 300; /* Add one day for each 400 years */ days += year / 400; - } - else { + } else { /* * 1972 is the closest later year after 1970. * Include the current year, so subtract 2. @@ -183,20 +177,17 @@ get_datetimestruct_days(const pandas_datetimestruct *dts) * Modifies '*days_' to be the day offset within the year, * and returns the year. */ -static npy_int64 -days_to_yearsdays(npy_int64 *days_) -{ - const npy_int64 days_per_400years = (400*365 + 100 - 4 + 1); +static npy_int64 days_to_yearsdays(npy_int64 *days_) { + const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1); /* Adjust so it's relative to the year 2000 (divisible by 400) */ - npy_int64 days = (*days_) - (365*30 + 7); + npy_int64 days = (*days_) - (365 * 30 + 7); npy_int64 year; /* Break down the 400 year cycle to get the year and day within the year */ if (days >= 0) { year = 400 * (days / days_per_400years); days = days % days_per_400years; - } - else { + } else { year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); days = days % days_per_400years; if (days < 0) { @@ -206,14 +197,14 @@ days_to_yearsdays(npy_int64 *days_) /* Work out the year/day within the 400 year cycle */ if (days >= 366) { - year += 100 * ((days-1) / (100*365 + 25 - 1)); - days = (days-1) % (100*365 + 25 - 1); + year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); + days = (days - 1) % (100 * 365 + 25 - 1); if (days >= 365) { - year += 4 * ((days+1) / (4*365 + 1)); - days = (days+1) % (4*365 + 1); + year += 4 * ((days + 1) / (4 * 365 + 1)); + days = (days + 1) % (4 * 365 + 1); if (days >= 366) { - year += (days-1) / 365; - days = (days-1) % 365; + year += (days - 1) / 365; + days = (days - 1) % 365; } } } @@ -226,9 +217,8 @@ days_to_yearsdays(npy_int64 *days_) * Adjusts a datetimestruct based on a seconds offset. Assumes * the current values are valid. 
*/ -NPY_NO_EXPORT void -add_seconds_to_datetimestruct(pandas_datetimestruct *dts, int seconds) -{ +NPY_NO_EXPORT void add_seconds_to_datetimestruct(pandas_datetimestruct *dts, + int seconds) { int minutes; dts->sec += seconds; @@ -240,8 +230,7 @@ add_seconds_to_datetimestruct(pandas_datetimestruct *dts, int seconds) dts->sec += 60; } add_minutes_to_datetimestruct(dts, minutes); - } - else if (dts->sec >= 60) { + } else if (dts->sec >= 60) { minutes = dts->sec / 60; dts->sec = dts->sec % 60; add_minutes_to_datetimestruct(dts, minutes); @@ -252,9 +241,8 @@ add_seconds_to_datetimestruct(pandas_datetimestruct *dts, int seconds) * Fills in the year, month, day in 'dts' based on the days * offset from 1970. */ -static void -set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts) -{ +static void set_datetimestruct_days(npy_int64 days, + pandas_datetimestruct *dts) { const int *month_lengths; int i; @@ -266,8 +254,7 @@ set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts) dts->month = i + 1; dts->day = days + 1; return; - } - else { + } else { days -= month_lengths[i]; } } @@ -276,9 +263,8 @@ set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts) /* * Compares two pandas_datetimestruct objects chronologically */ -int -cmp_pandas_datetimestruct(pandas_datetimestruct *a, pandas_datetimestruct *b) -{ +int cmp_pandas_datetimestruct(pandas_datetimestruct *a, + pandas_datetimestruct *b) { if (a->year > b->year) { return 1; } else if (a->year < b->year) { @@ -355,11 +341,10 @@ cmp_pandas_datetimestruct(pandas_datetimestruct *a, pandas_datetimestruct *b) * Returns -1 on error, 0 on success, and 1 (with no error set) * if obj doesn't have the neeeded date or datetime attributes. */ -int -convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, - PANDAS_DATETIMEUNIT *out_bestunit, - int apply_tzinfo) -{ +int convert_pydatetime_to_datetimestruct(PyObject *obj, + pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, + int apply_tzinfo) { PyObject *tmp; int isleap; @@ -370,8 +355,8 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, /* Need at least year/month/day attributes */ if (!PyObject_HasAttrString(obj, "year") || - !PyObject_HasAttrString(obj, "month") || - !PyObject_HasAttrString(obj, "day")) { + !PyObject_HasAttrString(obj, "month") || + !PyObject_HasAttrString(obj, "day")) { return 1; } @@ -417,15 +402,15 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, } isleap = is_leapyear(out->year); if (out->day < 1 || - out->day > days_per_month_table[isleap][out->month-1]) { + out->day > days_per_month_table[isleap][out->month - 1]) { goto invalid_date; } /* Check for time attributes (if not there, return success as a date) */ if (!PyObject_HasAttrString(obj, "hour") || - !PyObject_HasAttrString(obj, "minute") || - !PyObject_HasAttrString(obj, "second") || - !PyObject_HasAttrString(obj, "microsecond")) { + !PyObject_HasAttrString(obj, "minute") || + !PyObject_HasAttrString(obj, "second") || + !PyObject_HasAttrString(obj, "microsecond")) { /* The best unit for date is 'D' */ if (out_bestunit != NULL) { *out_bestunit = PANDAS_FR_D; @@ -481,10 +466,8 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, } Py_DECREF(tmp); - if (out->hour < 0 || out->hour >= 24 || - out->min < 0 || out->min >= 60 || - out->sec < 0 || out->sec >= 60 || - out->us < 0 || out->us >= 1000000) { + if (out->hour < 0 || out->hour >= 24 || out->min < 0 || out->min >= 60 || + 
out->sec < 0 || out->sec >= 60 || out->us < 0 || out->us >= 1000000) { goto invalid_time; } @@ -496,8 +479,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, } if (tmp == Py_None) { Py_DECREF(tmp); - } - else { + } else { PyObject *offset; int seconds_offset, minutes_offset; @@ -540,20 +522,20 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, invalid_date: PyErr_Format(PyExc_ValueError, - "Invalid date (%d,%d,%d) when converting to NumPy datetime", - (int)out->year, (int)out->month, (int)out->day); + "Invalid date (%d,%d,%d) when converting to NumPy datetime", + (int)out->year, (int)out->month, (int)out->day); return -1; invalid_time: PyErr_Format(PyExc_ValueError, - "Invalid time (%d,%d,%d,%d) when converting " - "to NumPy datetime", - (int)out->hour, (int)out->min, (int)out->sec, (int)out->us); + "Invalid time (%d,%d,%d,%d) when converting " + "to NumPy datetime", + (int)out->hour, (int)out->min, (int)out->sec, (int)out->us); return -1; } -npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d) -{ +npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d) { pandas_datetime_metadata meta; npy_datetime result = PANDAS_DATETIME_NAT; @@ -565,8 +547,7 @@ npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_da } void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, - pandas_datetimestruct *result) -{ + pandas_datetimestruct *result) { pandas_datetime_metadata meta; meta.base = fr; @@ -576,10 +557,9 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, } PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { - return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *) obj)->obmeta.base; + return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *)obj)->obmeta.base; } - /* * Converts a datetime from a datetimestruct to a datetime based * on some metadata. The date is assumed to be valid. @@ -588,23 +568,19 @@ PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { * * Returns 0 on success, -1 on failure. 
*/ -int -convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, - const pandas_datetimestruct *dts, - npy_datetime *out) -{ +int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, + const pandas_datetimestruct *dts, + npy_datetime *out) { npy_datetime ret; PANDAS_DATETIMEUNIT base = meta->base; if (base == PANDAS_FR_Y) { /* Truncate to the year */ ret = dts->year - 1970; - } - else if (base == PANDAS_FR_M) { + } else if (base == PANDAS_FR_M) { /* Truncate to the month */ ret = 12 * (dts->year - 1970) + (dts->month - 1); - } - else { + } else { /* Otherwise calculate the number of days to start */ npy_int64 days = get_datetimestruct_days(dts); @@ -613,8 +589,7 @@ convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, /* Truncate to weeks */ if (days >= 0) { ret = days / 7; - } - else { + } else { ret = (days - 6) / 7; } break; @@ -622,74 +597,69 @@ convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, ret = days; break; case PANDAS_FR_h: - ret = days * 24 + - dts->hour; + ret = days * 24 + dts->hour; break; case PANDAS_FR_m: - ret = (days * 24 + - dts->hour) * 60 + - dts->min; + ret = (days * 24 + dts->hour) * 60 + dts->min; break; case PANDAS_FR_s: - ret = ((days * 24 + - dts->hour) * 60 + - dts->min) * 60 + - dts->sec; + ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec; break; case PANDAS_FR_ms: - ret = (((days * 24 + - dts->hour) * 60 + - dts->min) * 60 + - dts->sec) * 1000 + + ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + + dts->sec) * + 1000 + dts->us / 1000; break; case PANDAS_FR_us: - ret = (((days * 24 + - dts->hour) * 60 + - dts->min) * 60 + - dts->sec) * 1000000 + + ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + + dts->sec) * + 1000000 + dts->us; break; case PANDAS_FR_ns: - ret = ((((days * 24 + - dts->hour) * 60 + - dts->min) * 60 + - dts->sec) * 1000000 + - dts->us) * 1000 + + ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + + dts->sec) * + 1000000 + + dts->us) * + 1000 + dts->ps / 1000; break; case PANDAS_FR_ps: - ret = ((((days * 24 + - dts->hour) * 60 + - dts->min) * 60 + - dts->sec) * 1000000 + - dts->us) * 1000000 + + ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + + dts->sec) * + 1000000 + + dts->us) * + 1000000 + dts->ps; break; case PANDAS_FR_fs: /* only 2.6 hours */ - ret = (((((days * 24 + - dts->hour) * 60 + - dts->min) * 60 + - dts->sec) * 1000000 + - dts->us) * 1000000 + - dts->ps) * 1000 + + ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + + dts->sec) * + 1000000 + + dts->us) * + 1000000 + + dts->ps) * + 1000 + dts->as / 1000; break; case PANDAS_FR_as: /* only 9.2 secs */ - ret = (((((days * 24 + - dts->hour) * 60 + - dts->min) * 60 + - dts->sec) * 1000000 + - dts->us) * 1000000 + - dts->ps) * 1000000 + + ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + + dts->sec) * + 1000000 + + dts->us) * + 1000000 + + dts->ps) * + 1000000 + dts->as; break; default: /* Something got corrupted */ - PyErr_SetString(PyExc_ValueError, - "NumPy datetime metadata with corrupt unit value"); + PyErr_SetString( + PyExc_ValueError, + "NumPy datetime metadata with corrupt unit value"); return -1; } } @@ -698,8 +668,7 @@ convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, if (meta->num > 1) { if (ret >= 0) { ret /= meta->num; - } - else { + } else { ret = (ret - meta->num + 1) / meta->num; } } @@ -709,18 +678,15 @@ convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, return 0; } - /* * This provides the casting rules for the TIMEDELTA data 
type units. * * Notably, there is a barrier between the nonlinear years and * months units, and all the other units. */ -npy_bool -can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, - PANDAS_DATETIMEUNIT dst_unit, - NPY_CASTING casting) -{ +npy_bool can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting) { switch (casting) { /* Allow anything with unsafe casting */ case NPY_UNSAFE_CASTING: @@ -732,7 +698,7 @@ can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, */ case NPY_SAME_KIND_CASTING: return (src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || - (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M); + (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M); /* * Enforce the 'date units' vs 'time units' barrier and that @@ -741,7 +707,7 @@ can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, */ case NPY_SAFE_CASTING: return (src_unit <= dst_unit) && - ((src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || + ((src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M)); /* Enforce equality with 'no' or 'equiv' casting */ @@ -756,11 +722,9 @@ can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, * Notably, there is a barrier between 'date units' and 'time units' * for all but 'unsafe' casting. */ -npy_bool -can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, - PANDAS_DATETIMEUNIT dst_unit, - NPY_CASTING casting) -{ +npy_bool can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting) { switch (casting) { /* Allow anything with unsafe casting */ case NPY_UNSAFE_CASTING: @@ -772,7 +736,7 @@ can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, */ case NPY_SAME_KIND_CASTING: return (src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || - (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D); + (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D); /* * Enforce the 'date units' vs 'time units' barrier and that @@ -781,7 +745,7 @@ can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, */ case NPY_SAFE_CASTING: return (src_unit <= dst_unit) && - ((src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || + ((src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D)); /* Enforce equality with 'no' or 'equiv' casting */ @@ -793,11 +757,9 @@ can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, /* * Converts a datetime based on the given metadata into a datetimestruct */ -int -convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, - npy_datetime dt, - pandas_datetimestruct *out) -{ +int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, + npy_datetime dt, + pandas_datetimestruct *out) { npy_int64 perday; /* Initialize the output to all zeros */ @@ -820,12 +782,11 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, case PANDAS_FR_M: if (dt >= 0) { - out->year = 1970 + dt / 12; + out->year = 1970 + dt / 12; out->month = dt % 12 + 1; - } - else { - out->year = 1969 + (dt + 1) / 12; - out->month = 12 + (dt + 1)% 12; + } else { + out->year = 1969 + (dt + 1) / 12; + out->month = 12 + (dt + 1) % 12; } break; @@ -843,12 +804,11 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, if (dt >= 0) { set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } - else { - set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 
0 : 1), - out); - dt = (perday-1) + (dt + 1) % perday; + dt = dt % perday; + } else { + set_datetimestruct_days( + dt / perday - (dt % perday == 0 ? 0 : 1), out); + dt = (perday - 1) + (dt + 1) % perday; } out->hour = dt; break; @@ -858,12 +818,11 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, if (dt >= 0) { set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } - else { - set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), - out); - dt = (perday-1) + (dt + 1) % perday; + dt = dt % perday; + } else { + set_datetimestruct_days( + dt / perday - (dt % perday == 0 ? 0 : 1), out); + dt = (perday - 1) + (dt + 1) % perday; } out->hour = dt / 60; out->min = dt % 60; @@ -874,14 +833,13 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, if (dt >= 0) { set_datetimestruct_days(dt / perday, out); - dt = dt % perday; + dt = dt % perday; + } else { + set_datetimestruct_days( + dt / perday - (dt % perday == 0 ? 0 : 1), out); + dt = (perday - 1) + (dt + 1) % perday; } - else { - set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), - out); - dt = (perday-1) + (dt + 1) % perday; - } - out->hour = dt / (60*60); + out->hour = dt / (60 * 60); out->min = (dt / 60) % 60; out->sec = dt % 60; break; @@ -891,15 +849,14 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, if (dt >= 0) { set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } - else { - set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), - out); - dt = (perday-1) + (dt + 1) % perday; + dt = dt % perday; + } else { + set_datetimestruct_days( + dt / perday - (dt % perday == 0 ? 0 : 1), out); + dt = (perday - 1) + (dt + 1) % perday; } - out->hour = dt / (60*60*1000LL); - out->min = (dt / (60*1000LL)) % 60; + out->hour = dt / (60 * 60 * 1000LL); + out->min = (dt / (60 * 1000LL)) % 60; out->sec = (dt / 1000LL) % 60; out->us = (dt % 1000LL) * 1000; break; @@ -909,15 +866,14 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, if (dt >= 0) { set_datetimestruct_days(dt / perday, out); - dt = dt % perday; + dt = dt % perday; + } else { + set_datetimestruct_days( + dt / perday - (dt % perday == 0 ? 0 : 1), out); + dt = (perday - 1) + (dt + 1) % perday; } - else { - set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), - out); - dt = (perday-1) + (dt + 1) % perday; - } - out->hour = dt / (60*60*1000000LL); - out->min = (dt / (60*1000000LL)) % 60; + out->hour = dt / (60 * 60 * 1000000LL); + out->min = (dt / (60 * 1000000LL)) % 60; out->sec = (dt / 1000000LL) % 60; out->us = dt % 1000000LL; break; @@ -927,15 +883,14 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, if (dt >= 0) { set_datetimestruct_days(dt / perday, out); - dt = dt % perday; - } - else { - set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), - out); - dt = (perday-1) + (dt + 1) % perday; + dt = dt % perday; + } else { + set_datetimestruct_days( + dt / perday - (dt % perday == 0 ? 
0 : 1), out); + dt = (perday - 1) + (dt + 1) % perday; } - out->hour = dt / (60*60*1000000000LL); - out->min = (dt / (60*1000000000LL)) % 60; + out->hour = dt / (60 * 60 * 1000000000LL); + out->min = (dt / (60 * 1000000000LL)) % 60; out->sec = (dt / 1000000000LL) % 60; out->us = (dt / 1000LL) % 1000000LL; out->ps = (dt % 1000LL) * 1000; @@ -946,15 +901,14 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, if (dt >= 0) { set_datetimestruct_days(dt / perday, out); - dt = dt % perday; + dt = dt % perday; + } else { + set_datetimestruct_days( + dt / perday - (dt % perday == 0 ? 0 : 1), out); + dt = (perday - 1) + (dt + 1) % perday; } - else { - set_datetimestruct_days(dt / perday - (dt % perday == 0 ? 0 : 1), - out); - dt = (perday-1) + (dt + 1) % perday; - } - out->hour = dt / (60*60*1000000000000LL); - out->min = (dt / (60*1000000000000LL)) % 60; + out->hour = dt / (60 * 60 * 1000000000000LL); + out->min = (dt / (60 * 1000000000000LL)) % 60; out->sec = (dt / 1000000000000LL) % 60; out->us = (dt / 1000000LL) % 1000000LL; out->ps = dt % 1000000LL; @@ -963,20 +917,19 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, case PANDAS_FR_fs: /* entire range is only +- 2.6 hours */ if (dt >= 0) { - out->hour = dt / (60*60*1000000000000000LL); - out->min = (dt / (60*1000000000000000LL)) % 60; + out->hour = dt / (60 * 60 * 1000000000000000LL); + out->min = (dt / (60 * 1000000000000000LL)) % 60; out->sec = (dt / 1000000000000000LL) % 60; out->us = (dt / 1000000000LL) % 1000000LL; out->ps = (dt / 1000LL) % 1000000LL; out->as = (dt % 1000LL) * 1000; - } - else { + } else { npy_datetime minutes; - minutes = dt / (60*1000000000000000LL); - dt = dt % (60*1000000000000000LL); + minutes = dt / (60 * 1000000000000000LL); + dt = dt % (60 * 1000000000000000LL); if (dt < 0) { - dt += (60*1000000000000000LL); + dt += (60 * 1000000000000000LL); --minutes; } /* Offset the negative minutes */ @@ -995,8 +948,7 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, out->us = (dt / 1000000000000LL) % 1000000LL; out->ps = (dt / 1000000LL) % 1000000LL; out->as = dt % 1000000LL; - } - else { + } else { npy_datetime seconds; seconds = dt / 1000000000000000000LL; @@ -1015,11 +967,10 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, default: PyErr_SetString(PyExc_RuntimeError, - "NumPy datetime metadata is corrupted with invalid " - "base unit"); + "NumPy datetime metadata is corrupted with invalid " + "base unit"); return -1; } return 0; } - diff --git a/pandas/src/datetime/np_datetime.h b/pandas/src/datetime/np_datetime.h index f200d3a289c06..3445fc3e48376 100644 --- a/pandas/src/datetime/np_datetime.h +++ b/pandas/src/datetime/np_datetime.h @@ -1,29 +1,41 @@ /* - * This is derived from numpy 1.7 - * See NP_LICENSE.TXT - */ -#ifndef _PANDAS_DATETIME_H_ -#define _PANDAS_DATETIME_H_ +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. + +Copyright (c) 2005-2011, NumPy Developers +All rights reserved. + +This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt + +*/ + +#ifndef PANDAS_SRC_DATETIME_NP_DATETIME_H_ +#define PANDAS_SRC_DATETIME_NP_DATETIME_H_ #include typedef enum { - PANDAS_FR_Y = 0, /* Years */ - PANDAS_FR_M = 1, /* Months */ - PANDAS_FR_W = 2, /* Weeks */ - /* Gap where NPY_FR_B was */ - PANDAS_FR_D = 4, /* Days */ - PANDAS_FR_h = 5, /* hours */ - PANDAS_FR_m = 6, /* minutes */ - PANDAS_FR_s = 7, /* seconds */ - PANDAS_FR_ms = 8,/* milliseconds */ - PANDAS_FR_us = 9,/* microseconds */ - PANDAS_FR_ns = 10,/* nanoseconds */ - PANDAS_FR_ps = 11,/* picoseconds */ - PANDAS_FR_fs = 12,/* femtoseconds */ - PANDAS_FR_as = 13,/* attoseconds */ - PANDAS_FR_GENERIC = 14 /* Generic, unbound units, can convert to anything */ + PANDAS_FR_Y = 0, // Years + PANDAS_FR_M = 1, // Months + PANDAS_FR_W = 2, // Weeks + // Gap where NPY_FR_B was + PANDAS_FR_D = 4, // Days + PANDAS_FR_h = 5, // hours + PANDAS_FR_m = 6, // minutes + PANDAS_FR_s = 7, // seconds + PANDAS_FR_ms = 8, // milliseconds + PANDAS_FR_us = 9, // microseconds + PANDAS_FR_ns = 10, // nanoseconds + PANDAS_FR_ps = 11, // picoseconds + PANDAS_FR_fs = 12, // femtoseconds + PANDAS_FR_as = 13, // attoseconds + PANDAS_FR_GENERIC = 14 // Generic, unbound units, can + // convert to anything } PANDAS_DATETIMEUNIT; #define PANDAS_DATETIME_NUMUNITS 13 @@ -45,7 +57,8 @@ typedef struct { // stuff pandas needs // ---------------------------------------------------------------------------- -int convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, +int convert_pydatetime_to_datetimestruct(PyObject *obj, + pandas_datetimestruct *out, PANDAS_DATETIMEUNIT *out_bestunit, int apply_tzinfo); @@ -96,11 +109,6 @@ add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes); * Notably, there is a barrier between the nonlinear years and * months units, and all the other units. */ -//npy_bool -//can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, -// PANDAS_DATETIMEUNIT dst_unit, -// NPY_CASTING casting); - npy_bool can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, PANDAS_DATETIMEUNIT dst_unit, @@ -116,4 +124,4 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj); -#endif +#endif // PANDAS_SRC_DATETIME_NP_DATETIME_H_ diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c index b633d6cde0820..5307d394423ff 100644 --- a/pandas/src/datetime/np_datetime_strings.c +++ b/pandas/src/datetime/np_datetime_strings.c @@ -1,11 +1,23 @@ /* - * This file implements string parsing and creation for NumPy datetime. - * - * Written by Mark Wiebe (mwwiebe@gmail.com) - * Copyright (c) 2011 by Enthought, Inc. - * - * See NP_LICENSE.txt for the license. - */ + +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. + +Written by Mark Wiebe (mwwiebe@gmail.com) +Copyright (c) 2011 by Enthought, Inc. + +Copyright (c) 2005-2011, NumPy Developers +All rights reserved. + +See NUMPY_LICENSE.txt for the license. + +This file implements string parsing and creation for NumPy datetime. 
+ +*/ #define PY_SSIZE_T_CLEAN #define NO_IMPORT @@ -20,9 +32,7 @@ #include "np_datetime.h" #include "np_datetime_strings.h" -NPY_NO_EXPORT const char * -npy_casting_to_string(NPY_CASTING casting) -{ +NPY_NO_EXPORT const char *npy_casting_to_string(NPY_CASTING casting) { switch (casting) { case NPY_NO_CASTING: return "'no'"; @@ -42,35 +52,23 @@ npy_casting_to_string(NPY_CASTING casting) /* Platform-specific time_t typedef */ typedef time_t NPY_TIME_T; -/*// We *do* want these symbols, but for cython, not for C. fine in mac osx,*/ -/*// linux complains.*/ -/*static void _suppress_unused_variable_warning(void)*/ -/*{*/ -/* int x = days_per_month_table[0][0];*/ -/* x = x;*/ +/* We *do* want these symbols, but for Cython, not for C. + Fine in Mac OSX, but Linux complains. + +static void _suppress_unused_variable_warning(void) { + int x = days_per_month_table[0][0]; + x = x; -/* int y = _month_offset[0][0];*/ -/* y = y;*/ + int y = _month_offset[0][0]; + y = y; -/* char *z = _datetime_strings[0];*/ -/* z = z;*/ -/*}*/ + char *z = _datetime_strings[0]; + z = z; +} */ /* Exported as DATETIMEUNITS in multiarraymodule.c */ static char *_datetime_strings[PANDAS_DATETIME_NUMUNITS] = { - "Y", - "M", - "W", - "D", - "h", - "m", - "s", - "ms", - "us", - "ns", - "ps", - "fs", - "as", + "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "ns", "ps", "fs", "as", }; /* * Wraps `localtime` functionality for multiple platforms. This @@ -78,30 +76,28 @@ static char *_datetime_strings[PANDAS_DATETIME_NUMUNITS] = { * * Returns 0 on success, -1 on failure. */ -static int -get_localtime(NPY_TIME_T *ts, struct tm *tms) -{ +static int get_localtime(NPY_TIME_T *ts, struct tm *tms) { char *func_name = ""; #if defined(_WIN32) - #if defined(_MSC_VER) && (_MSC_VER >= 1400) +#if defined(_MSC_VER) && (_MSC_VER >= 1400) if (localtime_s(tms, ts) != 0) { func_name = "localtime_s"; goto fail; } - #elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) +#elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) if (_localtime64_s(tms, ts) != 0) { func_name = "_localtime64_s"; goto fail; } - #else +#else struct tm *tms_tmp; - tms_tmp = localtime(ts); + localtime_r(ts, tms_tmp); if (tms_tmp == NULL) { func_name = "localtime"; goto fail; } memcpy(tms, tms_tmp, sizeof(struct tm)); - #endif +#endif #else if (localtime_r(ts, tms) == NULL) { func_name = "localtime_r"; @@ -112,8 +108,10 @@ get_localtime(NPY_TIME_T *ts, struct tm *tms) return 0; fail: - PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " - "to a local time", func_name); + PyErr_Format(PyExc_OSError, + "Failed to use '%s' to convert " + "to a local time", + func_name); return -1; } @@ -125,29 +123,28 @@ get_localtime(NPY_TIME_T *ts, struct tm *tms) * Returns 0 on success, -1 on failure. 
*/ static int -get_gmtime(NPY_TIME_T *ts, struct tm *tms) -{ +get_gmtime(NPY_TIME_T *ts, struct tm *tms) { char *func_name = ""; #if defined(_WIN32) - #if defined(_MSC_VER) && (_MSC_VER >= 1400) +#if defined(_MSC_VER) && (_MSC_VER >= 1400) if (gmtime_s(tms, ts) != 0) { func_name = "gmtime_s"; goto fail; } - #elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) +#elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) if (_gmtime64_s(tms, ts) != 0) { func_name = "_gmtime64_s"; goto fail; } - #else +#else struct tm *tms_tmp; - tms_tmp = gmtime(ts); + gmtime_r(ts, tms_tmp); if (tms_tmp == NULL) { func_name = "gmtime"; goto fail; } memcpy(tms, tms_tmp, sizeof(struct tm)); - #endif +#endif #else if (gmtime_r(ts, tms) == NULL) { func_name = "gmtime_r"; @@ -170,10 +167,9 @@ get_gmtime(NPY_TIME_T *ts, struct tm *tms) * * Returns 0 on success, -1 on failure. */ -static int -convert_datetimestruct_utc_to_local(pandas_datetimestruct *out_dts_local, - const pandas_datetimestruct *dts_utc, int *out_timezone_offset) -{ +static int convert_datetimestruct_utc_to_local( + pandas_datetimestruct *out_dts_local, const pandas_datetimestruct *dts_utc, + int *out_timezone_offset) { NPY_TIME_T rawtime = 0, localrawtime; struct tm tm_; npy_int64 year_correction = 0; @@ -187,8 +183,7 @@ convert_datetimestruct_utc_to_local(pandas_datetimestruct *out_dts_local, /* 2036 is a leap year */ year_correction = out_dts_local->year - 2036; out_dts_local->year -= year_correction; - } - else { + } else { /* 2037 is not a leap year */ year_correction = out_dts_local->year - 2037; out_dts_local->year -= year_correction; @@ -239,8 +234,7 @@ convert_datetimestruct_utc_to_local(pandas_datetimestruct *out_dts_local, */ static int convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, - const pandas_datetimestruct *dts_local) -{ + const pandas_datetimestruct *dts_local) { npy_int64 year_correction = 0; /* Make a copy of the input 'dts' to modify */ @@ -252,8 +246,7 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, /* 2036 is a leap year */ year_correction = out_dts_utc->year - 2036; out_dts_utc->year -= year_correction; - } - else { + } else { /* 2037 is not a leap year */ year_correction = out_dts_utc->year - 2037; out_dts_utc->year -= year_correction; @@ -332,7 +325,8 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, /* } */ /* /\* Parse the ISO date *\/ */ -/* if (parse_iso_8601_datetime(str, len, PANDAS_FR_us, NPY_UNSAFE_CASTING, */ +/* if (parse_iso_8601_datetime(str, len, PANDAS_FR_us, NPY_UNSAFE_CASTING, + */ /* dts, NULL, &bestunit, NULL) < 0) { */ /* Py_DECREF(bytes); */ /* return -1; */ @@ -342,7 +336,6 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, /* return 0; */ /* } */ - /* * Parses (almost) standard ISO 8601 date strings. The differences are: * @@ -365,7 +358,7 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * to be cast to the 'unit' parameter. * * 'out' gets filled with the parsed date-time. - * 'out_local' gets set to 1 if the parsed time contains timezone, + * 'out_local' gets set to 1 if the parsed time contains timezone, * to 0 otherwise. * 'out_tzoffset' gets set to timezone offset by minutes * if the parsed time was in local time, @@ -381,16 +374,11 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * * Returns 0 on success, -1 on failure. 
*/ -int -parse_iso_8601_datetime(char *str, int len, - PANDAS_DATETIMEUNIT unit, - NPY_CASTING casting, - pandas_datetimestruct *out, - int *out_local, - int *out_tzoffset, - PANDAS_DATETIMEUNIT *out_bestunit, - npy_bool *out_special) -{ +int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, pandas_datetimestruct *out, + int *out_local, int *out_tzoffset, + PANDAS_DATETIMEUNIT *out_bestunit, + npy_bool *out_special) { int year_leap = 0; int i, numdigits; char *substr, sublen; @@ -417,7 +405,6 @@ parse_iso_8601_datetime(char *str, int len, out->month = 1; out->day = 1; - /* * The string "today" means take today's date in local time, and * convert it to a date representation. This date representation, if @@ -427,11 +414,9 @@ parse_iso_8601_datetime(char *str, int len, * switching to an adjacent day depending on the current time and your * timezone. */ - if (len == 5 && tolower(str[0]) == 't' && - tolower(str[1]) == 'o' && - tolower(str[2]) == 'd' && - tolower(str[3]) == 'a' && - tolower(str[4]) == 'y') { + if (len == 5 && tolower(str[0]) == 't' && tolower(str[1]) == 'o' && + tolower(str[2]) == 'd' && tolower(str[3]) == 'a' && + tolower(str[4]) == 'y') { NPY_TIME_T rawtime = 0; struct tm tm_; @@ -460,9 +445,9 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, - casting)) { - PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + if (!can_cast_datetime64_units(bestunit, unit, casting)) { + PyErr_Format(PyExc_TypeError, + "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", str, _datetime_strings[unit], npy_casting_to_string(casting)); @@ -473,9 +458,8 @@ parse_iso_8601_datetime(char *str, int len, } /* The string "now" resolves to the current UTC time */ - if (len == 3 && tolower(str[0]) == 'n' && - tolower(str[1]) == 'o' && - tolower(str[2]) == 'w') { + if (len == 3 && tolower(str[0]) == 'n' && tolower(str[1]) == 'o' && + tolower(str[2]) == 'w') { NPY_TIME_T rawtime = 0; pandas_datetime_metadata meta; @@ -503,9 +487,9 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, - casting)) { - PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + if (!can_cast_datetime64_units(bestunit, unit, casting)) { + PyErr_Format(PyExc_TypeError, + "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", str, _datetime_strings[unit], npy_casting_to_string(casting)); @@ -543,12 +527,11 @@ parse_iso_8601_datetime(char *str, int len, out->year = 0; if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && isdigit(substr[2]) && isdigit(substr[3])) { - out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') + - 10 * (substr[2] - '0') + (substr[3] - '0'); + 10 * (substr[2] - '0') + (substr[3] - '0'); substr += 4; - sublen -= 4;; + sublen -= 4; } /* Negate the year if necessary */ @@ -596,8 +579,7 @@ parse_iso_8601_datetime(char *str, int len, out->month = 10 * out->month + (*substr - '0'); ++substr; --sublen; - } - else if (!has_ymd_sep) { + } else if (!has_ymd_sep) { goto parse_error; } if (out->month < 1 || out->month > 12) { @@ -610,7 +592,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen == 0) { /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. 
*/ if (!has_ymd_sep) { - goto parse_error; + goto parse_error; } if (out_local != NULL) { *out_local = 0; @@ -631,7 +613,7 @@ parse_iso_8601_datetime(char *str, int len, /* PARSE THE DAY */ /* First digit required */ if (!isdigit(*substr)) { - goto parse_error; + goto parse_error; } out->day = (*substr - '0'); ++substr; @@ -641,13 +623,11 @@ parse_iso_8601_datetime(char *str, int len, out->day = 10 * out->day + (*substr - '0'); ++substr; --sublen; - } - else if (!has_ymd_sep) { + } else if (!has_ymd_sep) { goto parse_error; } if (out->day < 1 || - out->day > days_per_month_table[year_leap][out->month-1]) - { + out->day > days_per_month_table[year_leap][out->month - 1]) { PyErr_Format(PyExc_ValueError, "Day out of range in datetime string \"%s\"", str); goto error; @@ -684,7 +664,7 @@ parse_iso_8601_datetime(char *str, int len, --sublen; if (out->hour >= 24) { PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", str); + "Hours out of range in datetime string \"%s\"", str); goto error; } } @@ -706,8 +686,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } - } - else if (!isdigit(*substr)) { + } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { goto parse_error; } @@ -730,8 +709,7 @@ parse_iso_8601_datetime(char *str, int len, "Minutes out of range in datetime string \"%s\"", str); goto error; } - } - else if (!has_hms_sep) { + } else if (!has_hms_sep) { goto parse_error; } @@ -749,10 +727,8 @@ parse_iso_8601_datetime(char *str, int len, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } - } - else if (!has_hms_sep && isdigit(*substr)) { - } - else { + } else if (!has_hms_sep && isdigit(*substr)) { + } else { bestunit = PANDAS_FR_m; goto parse_timezone; } @@ -772,8 +748,7 @@ parse_iso_8601_datetime(char *str, int len, "Seconds out of range in datetime string \"%s\"", str); goto error; } - } - else if (!has_hms_sep) { + } else if (!has_hms_sep) { goto parse_error; } @@ -781,8 +756,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen > 0 && *substr == '.') { ++substr; --sublen; - } - else { + } else { bestunit = PANDAS_FR_s; goto parse_timezone; } @@ -791,7 +765,7 @@ parse_iso_8601_datetime(char *str, int len, numdigits = 0; for (i = 0; i < 6; ++i) { out->us *= 10; - if (sublen > 0 && isdigit(*substr)) { + if (sublen > 0 && isdigit(*substr)) { out->us += (*substr - '0'); ++substr; --sublen; @@ -802,8 +776,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen == 0 || !isdigit(*substr)) { if (numdigits > 3) { bestunit = PANDAS_FR_us; - } - else { + } else { bestunit = PANDAS_FR_ms; } goto parse_timezone; @@ -824,8 +797,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen == 0 || !isdigit(*substr)) { if (numdigits > 3) { bestunit = PANDAS_FR_ps; - } - else { + } else { bestunit = PANDAS_FR_ns; } goto parse_timezone; @@ -845,16 +817,15 @@ parse_iso_8601_datetime(char *str, int len, if (numdigits > 3) { bestunit = PANDAS_FR_as; - } - else { + } else { bestunit = PANDAS_FR_fs; } parse_timezone: /* trim any whitepsace between time/timeezone */ while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; + ++substr; + --sublen; } if (sublen == 0) { @@ -871,18 +842,16 @@ parse_iso_8601_datetime(char *str, int len, if (out_tzoffset != NULL) { *out_tzoffset = 0; - } + } if (sublen == 1) { goto finish; - } - else { + } else { ++substr; --sublen; } - } - /* Time zone offset */ - else if (*substr == '-' || *substr == '+') { + } else if (*substr == '-' || *substr == '+') { + /* Time zone 
offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; /* @@ -903,17 +872,16 @@ parse_iso_8601_datetime(char *str, int len, sublen -= 2; if (offset_hour >= 24) { PyErr_Format(PyExc_ValueError, - "Timezone hours offset out of range " - "in datetime string \"%s\"", str); + "Timezone hours offset out of range " + "in datetime string \"%s\"", + str); goto error; } - } - else if (sublen >= 1 && isdigit(substr[0])) { + } else if (sublen >= 1 && isdigit(substr[0])) { offset_hour = substr[0] - '0'; ++substr; --sublen; - } - else { + } else { goto parse_error; } @@ -932,17 +900,16 @@ parse_iso_8601_datetime(char *str, int len, sublen -= 2; if (offset_minute >= 60) { PyErr_Format(PyExc_ValueError, - "Timezone minutes offset out of range " - "in datetime string \"%s\"", str); + "Timezone minutes offset out of range " + "in datetime string \"%s\"", + str); goto error; } - } - else if (sublen >= 1 && isdigit(substr[0])) { + } else if (sublen >= 1 && isdigit(substr[0])) { offset_minute = substr[0] - '0'; ++substr; --sublen; - } - else { + } else { goto parse_error; } } @@ -975,9 +942,9 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, - casting)) { - PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + if (!can_cast_datetime64_units(bestunit, unit, casting)) { + PyErr_Format(PyExc_TypeError, + "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", str, _datetime_strings[unit], npy_casting_to_string(casting)); @@ -988,8 +955,8 @@ parse_iso_8601_datetime(char *str, int len, parse_error: PyErr_Format(PyExc_ValueError, - "Error parsing datetime string \"%s\" at position %d", - str, (int)(substr-str)); + "Error parsing datetime string \"%s\" at position %d", str, + (int)(substr - str)); return -1; error: @@ -1000,9 +967,7 @@ parse_iso_8601_datetime(char *str, int len, * Provides a string length to use for converting datetime * objects with the given local and unit settings. 
*/ -int -get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) -{ +int get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) { int len = 0; switch (base) { @@ -1010,28 +975,28 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) /*case PANDAS_FR_GENERIC:*/ /* return 4;*/ case PANDAS_FR_as: - len += 3; /* "###" */ + len += 3; /* "###" */ case PANDAS_FR_fs: - len += 3; /* "###" */ + len += 3; /* "###" */ case PANDAS_FR_ps: - len += 3; /* "###" */ + len += 3; /* "###" */ case PANDAS_FR_ns: - len += 3; /* "###" */ + len += 3; /* "###" */ case PANDAS_FR_us: - len += 3; /* "###" */ + len += 3; /* "###" */ case PANDAS_FR_ms: - len += 4; /* ".###" */ + len += 4; /* ".###" */ case PANDAS_FR_s: - len += 3; /* ":##" */ + len += 3; /* ":##" */ case PANDAS_FR_m: - len += 3; /* ":##" */ + len += 3; /* ":##" */ case PANDAS_FR_h: - len += 3; /* "T##" */ + len += 3; /* "T##" */ case PANDAS_FR_D: case PANDAS_FR_W: - len += 3; /* "-##" */ + len += 3; /* "-##" */ case PANDAS_FR_M: - len += 3; /* "-##" */ + len += 3; /* "-##" */ case PANDAS_FR_Y: len += 21; /* 64-bit year */ break; @@ -1042,10 +1007,9 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) if (base >= PANDAS_FR_h) { if (local) { - len += 5; /* "+####" or "-####" */ - } - else { - len += 1; /* "Z" */ + len += 5; /* "+####" or "-####" */ + } else { + len += 1; /* "Z" */ } } @@ -1058,43 +1022,31 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) * Finds the largest unit whose value is nonzero, and for which * the remainder for the rest of the units is zero. */ -static PANDAS_DATETIMEUNIT -lossless_unit_from_datetimestruct(pandas_datetimestruct *dts) -{ +static PANDAS_DATETIMEUNIT lossless_unit_from_datetimestruct( + pandas_datetimestruct *dts) { if (dts->as % 1000 != 0) { return PANDAS_FR_as; - } - else if (dts->as != 0) { + } else if (dts->as != 0) { return PANDAS_FR_fs; - } - else if (dts->ps % 1000 != 0) { + } else if (dts->ps % 1000 != 0) { return PANDAS_FR_ps; - } - else if (dts->ps != 0) { + } else if (dts->ps != 0) { return PANDAS_FR_ns; - } - else if (dts->us % 1000 != 0) { + } else if (dts->us % 1000 != 0) { return PANDAS_FR_us; - } - else if (dts->us != 0) { + } else if (dts->us != 0) { return PANDAS_FR_ms; - } - else if (dts->sec != 0) { + } else if (dts->sec != 0) { return PANDAS_FR_s; - } - else if (dts->min != 0) { + } else if (dts->min != 0) { return PANDAS_FR_m; - } - else if (dts->hour != 0) { + } else if (dts->hour != 0) { return PANDAS_FR_h; - } - else if (dts->day != 1) { + } else if (dts->day != 1) { return PANDAS_FR_D; - } - else if (dts->month != 1) { + } else if (dts->month != 1) { return PANDAS_FR_M; - } - else { + } else { return PANDAS_FR_Y; } } @@ -1125,11 +1077,9 @@ lossless_unit_from_datetimestruct(pandas_datetimestruct *dts) * Returns 0 on success, -1 on failure (for example if the output * string was too short). 
*/ -int -make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, - int local, PANDAS_DATETIMEUNIT base, int tzoffset, - NPY_CASTING casting) -{ +int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting) { pandas_datetimestruct dts_local; int timezone_offset = 0; @@ -1160,10 +1110,9 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, /* Set dts to point to our local time instead of the UTC time */ dts = &dts_local; - } - /* Use the manually provided tzoffset */ - else if (local) { - /* Make a copy of the pandas_datetimestruct we can modify */ + } else if (local) { + // Use the manually provided tzoffset. + // Make a copy of the pandas_datetimestruct we can modify. dts_local = *dts; dts = &dts_local; @@ -1180,22 +1129,23 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, if (casting != NPY_UNSAFE_CASTING) { /* Producing a date as a local time is always 'unsafe' */ if (base <= PANDAS_FR_D && local) { - PyErr_SetString(PyExc_TypeError, "Cannot create a local " - "timezone-based date string from a NumPy " - "datetime without forcing 'unsafe' casting"); + PyErr_SetString(PyExc_TypeError, + "Cannot create a local " + "timezone-based date string from a NumPy " + "datetime without forcing 'unsafe' casting"); return -1; - } - /* Only 'unsafe' and 'same_kind' allow data loss */ - else { + } else { + /* Only 'unsafe' and 'same_kind' allow data loss */ PANDAS_DATETIMEUNIT unitprec; unitprec = lossless_unit_from_datetimestruct(dts); if (casting != NPY_SAME_KIND_CASTING && unitprec > base) { - PyErr_Format(PyExc_TypeError, "Cannot create a " - "string with unit precision '%s' " - "from the NumPy datetime, which has data at " - "unit precision '%s', " - "requires 'unsafe' or 'same_kind' casting", + PyErr_Format(PyExc_TypeError, + "Cannot create a " + "string with unit precision '%s' " + "from the NumPy datetime, which has data at " + "unit precision '%s', " + "requires 'unsafe' or 'same_kind' casting", _datetime_strings[base], _datetime_strings[unitprec]); return -1; @@ -1203,12 +1153,12 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } } - /* YEAR */ - /* - * Can't use PyOS_snprintf, because it always produces a '\0' - * character at the end, and NumPy string types are permitted - * to have data all the way to the end of the buffer. - */ +/* YEAR */ +/* + * Can't use PyOS_snprintf, because it always produces a '\0' + * character at the end, and NumPy string types are permitted + * to have data all the way to the end of the buffer. 
+ */ #ifdef _WIN32 tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); #else @@ -1230,15 +1180,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* MONTH */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = '-'; - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->month / 10) + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->month % 10) + '0'); @@ -1254,15 +1204,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* DAY */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = '-'; - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->day / 10) + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->day % 10) + '0'); @@ -1278,15 +1228,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* HOUR */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = 'T'; - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->hour / 10) + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->hour % 10) + '0'); @@ -1299,15 +1249,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* MINUTE */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = ':'; - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->min / 10) + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->min % 10) + '0'); @@ -1320,15 +1270,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* SECOND */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = ':'; - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->sec / 10) + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->sec % 10) + '0'); @@ -1341,19 +1291,19 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* MILLISECOND */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = '.'; - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->us / 100000) % 10 + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->us / 10000) % 10 + '0'); - if (sublen < 4 ) { + if (sublen < 4) { goto string_too_short; } substr[3] = (char)((dts->us / 1000) % 10 + '0'); @@ -1366,15 +1316,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* MICROSECOND */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = (char)((dts->us / 100) % 10 + '0'); - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->us / 10) % 10 + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)(dts->us % 10 + '0'); @@ -1387,15 +1337,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* NANOSECOND */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = (char)((dts->ps / 100000) % 10 + '0'); - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->ps / 10000) % 10 + '0'); - if (sublen < 3 
) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->ps / 1000) % 10 + '0'); @@ -1408,15 +1358,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* PICOSECOND */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = (char)((dts->ps / 100) % 10 + '0'); - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->ps / 10) % 10 + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)(dts->ps % 10 + '0'); @@ -1429,15 +1379,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* FEMTOSECOND */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = (char)((dts->as / 100000) % 10 + '0'); - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->as / 10000) % 10 + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)((dts->as / 1000) % 10 + '0'); @@ -1450,15 +1400,15 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, } /* ATTOSECOND */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } substr[0] = (char)((dts->as / 100) % 10 + '0'); - if (sublen < 2 ) { + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((dts->as / 10) % 10 + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)(dts->as % 10 + '0'); @@ -1474,35 +1424,33 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, if (timezone_offset < 0) { substr[0] = '-'; timezone_offset = -timezone_offset; - } - else { + } else { substr[0] = '+'; } substr += 1; sublen -= 1; /* Add the timezone offset */ - if (sublen < 1 ) { + if (sublen < 1) { goto string_too_short; } - substr[0] = (char)((timezone_offset / (10*60)) % 10 + '0'); - if (sublen < 2 ) { + substr[0] = (char)((timezone_offset / (10 * 60)) % 10 + '0'); + if (sublen < 2) { goto string_too_short; } substr[1] = (char)((timezone_offset / 60) % 10 + '0'); - if (sublen < 3 ) { + if (sublen < 3) { goto string_too_short; } substr[2] = (char)(((timezone_offset % 60) / 10) % 10 + '0'); - if (sublen < 4 ) { + if (sublen < 4) { goto string_too_short; } substr[3] = (char)((timezone_offset % 60) % 10 + '0'); substr += 4; sublen -= 4; - } - /* UTC "Zulu" time */ - else { + } else { + /* UTC "Zulu" time */ if (sublen < 1) { goto string_too_short; } @@ -1520,8 +1468,8 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, string_too_short: PyErr_Format(PyExc_RuntimeError, - "The string provided for NumPy ISO datetime formatting " - "was too short, with length %d", - outlen); + "The string provided for NumPy ISO datetime formatting " + "was too short, with length %d", + outlen); return -1; } diff --git a/pandas/src/datetime/np_datetime_strings.h b/pandas/src/datetime/np_datetime_strings.h index 0d9a0944310fb..1114ec5eae064 100644 --- a/pandas/src/datetime/np_datetime_strings.h +++ b/pandas/src/datetime/np_datetime_strings.h @@ -1,9 +1,26 @@ /* - * This is derived from numpy 1.7. See NP_LICENSE.txt - */ -#ifndef _NPY_PRIVATE__DATETIME_STRINGS_H_ -#define _NPY_PRIVATE__DATETIME_STRINGS_H_ +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. + +Written by Mark Wiebe (mwwiebe@gmail.com) +Copyright (c) 2011 by Enthought, Inc. 
+ +Copyright (c) 2005-2011, NumPy Developers +All rights reserved. + +See NUMPY_LICENSE.txt for the license. + +This file implements string parsing and creation for NumPy datetime. + +*/ + +#ifndef PANDAS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ +#define PANDAS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ /* * Parses (almost) standard ISO 8601 date strings. The differences are: @@ -86,4 +103,4 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, int local, PANDAS_DATETIMEUNIT base, int tzoffset, NPY_CASTING casting); -#endif +#endif // PANDAS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ diff --git a/pandas/src/datetime_helper.h b/pandas/src/datetime_helper.h index 11399181fa4e7..2b24028ff3d8c 100644 --- a/pandas/src/datetime_helper.h +++ b/pandas/src/datetime_helper.h @@ -1,7 +1,19 @@ +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#ifndef PANDAS_SRC_DATETIME_HELPER_H_ +#define PANDAS_SRC_DATETIME_HELPER_H_ + +#include #include "datetime.h" #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" -#include #if PY_MAJOR_VERSION >= 3 #define PyInt_AS_LONG PyLong_AsLong @@ -10,7 +22,8 @@ npy_int64 get_long_attr(PyObject *o, const char *attr) { npy_int64 long_val; PyObject *value = PyObject_GetAttrString(o, attr); - long_val = (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyInt_AS_LONG(value)); + long_val = (PyLong_Check(value) ? + PyLong_AsLongLong(value) : PyInt_AS_LONG(value)); Py_DECREF(value); return long_val; } @@ -23,3 +36,5 @@ npy_float64 total_seconds(PyObject *td) { npy_int64 days_in_seconds = days * 24LL * 3600LL; return (microseconds + (seconds + days_in_seconds) * 1000000.0) / 1000000.0; } + +#endif // PANDAS_SRC_DATETIME_HELPER_H_ diff --git a/pandas/src/helper.h b/pandas/src/helper.h index b8c3cecbb2dc7..39bcf27e074df 100644 --- a/pandas/src/helper.h +++ b/pandas/src/helper.h @@ -1,16 +1,25 @@ -#ifndef C_HELPER_H -#define C_HELPER_H +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#ifndef PANDAS_SRC_HELPER_H_ +#define PANDAS_SRC_HELPER_H_ #ifndef PANDAS_INLINE #if defined(__GNUC__) #define PANDAS_INLINE static __inline__ #elif defined(_MSC_VER) #define PANDAS_INLINE static __inline - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define PANDAS_INLINE static inline #else #define PANDAS_INLINE #endif #endif -#endif +#endif // PANDAS_SRC_HELPER_H_ diff --git a/pandas/src/numpy_helper.h b/pandas/src/numpy_helper.h index 9f406890c4e68..17d5ec12f4f79 100644 --- a/pandas/src/numpy_helper.h +++ b/pandas/src/numpy_helper.h @@ -1,7 +1,19 @@ +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. 
+*/ + +#ifndef PANDAS_SRC_NUMPY_HELPER_H_ +#define PANDAS_SRC_NUMPY_HELPER_H_ + #include "Python.h" +#include "helper.h" #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" -#include "helper.h" #define PANDAS_FLOAT 0 #define PANDAS_INT 1 @@ -10,111 +22,87 @@ #define PANDAS_OBJECT 4 #define PANDAS_DATETIME 5 -PANDAS_INLINE int -infer_type(PyObject* obj) { - if (PyBool_Check(obj)) { - return PANDAS_BOOL; - } - else if (PyArray_IsIntegerScalar(obj)) { - return PANDAS_INT; - } - else if (PyArray_IsScalar(obj, Datetime)) { - return PANDAS_DATETIME; - } - else if (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)) { - return PANDAS_FLOAT; - } - else if (PyString_Check(obj) || PyUnicode_Check(obj)) { - return PANDAS_STRING; - } - else { - return PANDAS_OBJECT; - } +PANDAS_INLINE int infer_type(PyObject* obj) { + if (PyBool_Check(obj)) { + return PANDAS_BOOL; + } else if (PyArray_IsIntegerScalar(obj)) { + return PANDAS_INT; + } else if (PyArray_IsScalar(obj, Datetime)) { + return PANDAS_DATETIME; + } else if (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)) { + return PANDAS_FLOAT; + } else if (PyString_Check(obj) || PyUnicode_Check(obj)) { + return PANDAS_STRING; + } else { + return PANDAS_OBJECT; + } } -PANDAS_INLINE npy_int64 -get_nat(void) { - return NPY_MIN_INT64; -} +PANDAS_INLINE npy_int64 get_nat(void) { return NPY_MIN_INT64; } -PANDAS_INLINE npy_datetime -get_datetime64_value(PyObject* obj) { - return ((PyDatetimeScalarObject*) obj)->obval; +PANDAS_INLINE npy_datetime get_datetime64_value(PyObject* obj) { + return ((PyDatetimeScalarObject*)obj)->obval; } -PANDAS_INLINE npy_timedelta -get_timedelta64_value(PyObject* obj) { - return ((PyTimedeltaScalarObject*) obj)->obval; +PANDAS_INLINE npy_timedelta get_timedelta64_value(PyObject* obj) { + return ((PyTimedeltaScalarObject*)obj)->obval; } -PANDAS_INLINE int -is_integer_object(PyObject* obj) { - return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); -// return PyArray_IsIntegerScalar(obj); +PANDAS_INLINE int is_integer_object(PyObject* obj) { + return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); } -PANDAS_INLINE int -is_float_object(PyObject* obj) { - return (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)); +PANDAS_INLINE int is_float_object(PyObject* obj) { + return (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)); } -PANDAS_INLINE int -is_complex_object(PyObject* obj) { - return (PyComplex_Check(obj) || PyArray_IsScalar(obj, ComplexFloating)); +PANDAS_INLINE int is_complex_object(PyObject* obj) { + return (PyComplex_Check(obj) || PyArray_IsScalar(obj, ComplexFloating)); } -PANDAS_INLINE int -is_bool_object(PyObject* obj) { - return (PyBool_Check(obj) || PyArray_IsScalar(obj, Bool)); +PANDAS_INLINE int is_bool_object(PyObject* obj) { + return (PyBool_Check(obj) || PyArray_IsScalar(obj, Bool)); } -PANDAS_INLINE int -is_string_object(PyObject* obj) { - return (PyString_Check(obj) || PyUnicode_Check(obj)); +PANDAS_INLINE int is_string_object(PyObject* obj) { + return (PyString_Check(obj) || PyUnicode_Check(obj)); } -PANDAS_INLINE int -is_datetime64_object(PyObject *obj) { - return PyArray_IsScalar(obj, Datetime); +PANDAS_INLINE int is_datetime64_object(PyObject* obj) { + return PyArray_IsScalar(obj, Datetime); } -PANDAS_INLINE int -is_timedelta64_object(PyObject *obj) { - return PyArray_IsScalar(obj, Timedelta); +PANDAS_INLINE int is_timedelta64_object(PyObject* obj) { + return PyArray_IsScalar(obj, Timedelta); } -PANDAS_INLINE int -assign_value_1d(PyArrayObject* ap, Py_ssize_t _i, PyObject* 
v) { - npy_intp i = (npy_intp) _i; - char *item = (char *) PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 0); - return PyArray_DESCR(ap)->f->setitem(v, item, ap); +PANDAS_INLINE int assign_value_1d(PyArrayObject* ap, Py_ssize_t _i, + PyObject* v) { + npy_intp i = (npy_intp)_i; + char* item = (char*)PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 0); + return PyArray_DESCR(ap)->f->setitem(v, item, ap); } -PANDAS_INLINE PyObject* -get_value_1d(PyArrayObject* ap, Py_ssize_t i) { - char *item = (char *) PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 0); - return PyArray_Scalar(item, PyArray_DESCR(ap), (PyObject*) ap); +PANDAS_INLINE PyObject* get_value_1d(PyArrayObject* ap, Py_ssize_t i) { + char* item = (char*)PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 0); + return PyArray_Scalar(item, PyArray_DESCR(ap), (PyObject*)ap); } - -PANDAS_INLINE char* -get_c_string(PyObject* obj) { +PANDAS_INLINE char* get_c_string(PyObject* obj) { #if PY_VERSION_HEX >= 0x03000000 - PyObject* enc_str = PyUnicode_AsEncodedString(obj, "utf-8", "error"); + PyObject* enc_str = PyUnicode_AsEncodedString(obj, "utf-8", "error"); - char *ret; - ret = PyBytes_AS_STRING(enc_str); + char* ret; + ret = PyBytes_AS_STRING(enc_str); - // TODO: memory leak here + // TODO(general): memory leak here - // Py_XDECREF(enc_str); - return ret; + return ret; #else - return PyString_AsString(obj); + return PyString_AsString(obj); #endif } -PANDAS_INLINE PyObject* -char_to_string(char* data) { +PANDAS_INLINE PyObject* char_to_string(char* data) { #if PY_VERSION_HEX >= 0x03000000 return PyUnicode_FromString(data); #else @@ -122,61 +110,47 @@ char_to_string(char* data) { #endif } -// PANDAS_INLINE int -// is_string(PyObject* obj) { -// #if PY_VERSION_HEX >= 0x03000000 -// return PyUnicode_Check(obj); -// #else -// return PyString_Check(obj); -// #endif - -PyObject* sarr_from_data(PyArray_Descr *descr, int length, void* data) { - PyArrayObject *result; +PyObject* sarr_from_data(PyArray_Descr* descr, int length, void* data) { + PyArrayObject* result; npy_intp dims[1] = {length}; - Py_INCREF(descr); // newfromdescr steals a reference to descr - result = (PyArrayObject*) PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, - NULL, data, 0, NULL); + Py_INCREF(descr); // newfromdescr steals a reference to descr + result = (PyArrayObject*)PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, + NULL, data, 0, NULL); // Returned array doesn't own data by default result->flags |= NPY_OWNDATA; - return (PyObject*) result; + return (PyObject*)result; } - -void transfer_object_column(char *dst, char *src, size_t stride, +void transfer_object_column(char* dst, char* src, size_t stride, size_t length) { int i; size_t sz = sizeof(PyObject*); - for (i = 0; i < length; ++i) - { + for (i = 0; i < length; ++i) { // uninitialized data // Py_XDECREF(*((PyObject**) dst)); memcpy(dst, src, sz); - Py_INCREF(*((PyObject**) dst)); + Py_INCREF(*((PyObject**)dst)); src += sz; dst += stride; } } -void set_array_owndata(PyArrayObject *ao) { - ao->flags |= NPY_OWNDATA; -} +void set_array_owndata(PyArrayObject* ao) { ao->flags |= NPY_OWNDATA; } -void set_array_not_contiguous(PyArrayObject *ao) { +void set_array_not_contiguous(PyArrayObject* ao) { ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS); } - // If arr is zerodim array, return a proper array scalar (e.g. np.int64). // Otherwise, return arr as is. 
-PANDAS_INLINE PyObject* -unbox_if_zerodim(PyObject* arr) { +PANDAS_INLINE PyObject* unbox_if_zerodim(PyObject* arr) { if (PyArray_IsZeroDim(arr)) { - PyObject *ret; + PyObject* ret; ret = PyArray_ToScalar(PyArray_DATA(arr), arr); return ret; } else { @@ -185,20 +159,4 @@ unbox_if_zerodim(PyObject* arr) { } } - -// PANDAS_INLINE PyObject* -// get_base_ndarray(PyObject* ap) { -// // if (!ap || (NULL == ap)) { -// // Py_RETURN_NONE; -// // } - -// while (!PyArray_CheckExact(ap)) { -// ap = PyArray_BASE((PyArrayObject*) ap); -// if (ap == Py_None) Py_RETURN_NONE; -// } -// // PyArray_BASE is a borrowed reference -// if(ap) { -// Py_INCREF(ap); -// } -// return ap; -// } +#endif // PANDAS_SRC_NUMPY_HELPER_H_ diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h index e565f02f27c88..5d2a0dad3da17 100644 --- a/pandas/src/parse_helper.h +++ b/pandas/src/parse_helper.h @@ -1,3 +1,15 @@ +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. +*/ + +#ifndef PANDAS_SRC_PARSE_HELPER_H_ +#define PANDAS_SRC_PARSE_HELPER_H_ + #include #include #include "headers/portable.h" @@ -5,8 +17,8 @@ static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing, int *maybe_int); -int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int) -{ +int to_double(char *item, double *p_value, char sci, char decimal, + int *maybe_int) { char *p_end = NULL; *p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int); @@ -15,14 +27,14 @@ int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_in } #if PY_VERSION_HEX < 0x02060000 - #define PyBytes_Check PyString_Check - #define PyBytes_AS_STRING PyString_AS_STRING +#define PyBytes_Check PyString_Check +#define PyBytes_AS_STRING PyString_AS_STRING #endif -int floatify(PyObject* str, double *result, int *maybe_int) { +int floatify(PyObject *str, double *result, int *maybe_int) { int status; char *data; - PyObject* tmp = NULL; + PyObject *tmp = NULL; const char sci = 'E'; const char dec = '.'; @@ -70,17 +82,15 @@ int floatify(PyObject* str, double *result, int *maybe_int) { Py_XDECREF(tmp); return -1; -/* -#if PY_VERSION_HEX >= 0x03000000 - return PyFloat_FromString(str); -#else - return PyFloat_FromString(str, NULL); -#endif -*/ - + /* + #if PY_VERSION_HEX >= 0x03000000 + return PyFloat_FromString(str); + #else + return PyFloat_FromString(str, NULL); + #endif + */ } - // --------------------------------------------------------------------------- // Implementation of xstrtod @@ -104,10 +114,12 @@ int floatify(PyObject* str, double *result, int *maybe_int) { // may be used to endorse or promote products derived from this software // without specific prior written permission. // -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -125,149 +137,137 @@ int floatify(PyObject* str, double *result, int *maybe_int) { // PANDAS_INLINE void lowercase(char *p) { - for ( ; *p; ++p) *p = tolower(*p); + for (; *p; ++p) *p = tolower(*p); } PANDAS_INLINE void uppercase(char *p) { - for ( ; *p; ++p) *p = toupper(*p); + for (; *p; ++p) *p = toupper(*p); } +static double xstrtod(const char *str, char **endptr, char decimal, char sci, + int skip_trailing, int *maybe_int) { + double number; + int exponent; + int negative; + char *p = (char *)str; + double p10; + int n; + int num_digits; + int num_decimals; + + errno = 0; + *maybe_int = 1; -static double xstrtod(const char *str, char **endptr, char decimal, - char sci, int skip_trailing, int *maybe_int) -{ - double number; - int exponent; - int negative; - char *p = (char *) str; - double p10; - int n; - int num_digits; - int num_decimals; - - errno = 0; - *maybe_int = 1; - - // Skip leading whitespace - while (isspace(*p)) p++; - - // Handle optional sign - negative = 0; - switch (*p) - { - case '-': negative = 1; // Fall through to increment position - case '+': p++; - } - - number = 0.; - exponent = 0; - num_digits = 0; - num_decimals = 0; - - // Process string of digits - while (isdigit(*p)) - { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - } - - // Process decimal part - if (*p == decimal) - { - *maybe_int = 0; - p++; - - while (isdigit(*p)) - { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; + // Skip leading whitespace + while (isspace(*p)) p++; + + // Handle optional sign + negative = 0; + switch (*p) { + case '-': + negative = 1; // Fall through to increment position + case '+': + p++; } - exponent -= num_decimals; - } + number = 0.; + exponent = 0; + num_digits = 0; + num_decimals = 0; - if (num_digits == 0) - { - errno = ERANGE; - return 0.0; - } + // Process string of digits + while (isdigit(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + } - // Correct for sign - if (negative) number = -number; + // Process decimal part + if (*p == decimal) { + *maybe_int = 0; + p++; - // Process an exponent string - if (toupper(*p) == toupper(sci)) - { - *maybe_int = 0; + while (isdigit(*p)) { + number = number * 10. 
+ (*p - '0'); + p++; + num_digits++; + num_decimals++; + } - // Handle optional sign - negative = 0; - switch (*++p) - { - case '-': negative = 1; // Fall through to increment pos - case '+': p++; + exponent -= num_decimals; } - // Process string of digits - num_digits = 0; - n = 0; - while (isdigit(*p)) - { - n = n * 10 + (*p - '0'); - num_digits++; - p++; + if (num_digits == 0) { + errno = ERANGE; + return 0.0; } - if (negative) - exponent -= n; - else - exponent += n; - - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) - p--; - } - - - if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) - { - - errno = ERANGE; - return HUGE_VAL; - } - - // Scale the result - p10 = 10.; - n = exponent; - if (n < 0) n = -n; - while (n) - { - if (n & 1) - { - if (exponent < 0) - number /= p10; - else - number *= p10; + // Correct for sign + if (negative) number = -number; + + // Process an exponent string + if (toupper(*p) == toupper(sci)) { + *maybe_int = 0; + + // Handle optional sign + negative = 0; + switch (*++p) { + case '-': + negative = 1; // Fall through to increment pos + case '+': + p++; + } + + // Process string of digits + num_digits = 0; + n = 0; + while (isdigit(*p)) { + n = n * 10 + (*p - '0'); + num_digits++; + p++; + } + + if (negative) + exponent -= n; + else + exponent += n; + + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) p--; } - n >>= 1; - p10 *= p10; - } + if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { + errno = ERANGE; + return HUGE_VAL; + } - if (number == HUGE_VAL) { - errno = ERANGE; - } + // Scale the result + p10 = 10.; + n = exponent; + if (n < 0) n = -n; + while (n) { + if (n & 1) { + if (exponent < 0) + number /= p10; + else + number *= p10; + } + n >>= 1; + p10 *= p10; + } - if (skip_trailing) { - // Skip trailing whitespace - while (isspace(*p)) p++; - } + if (number == HUGE_VAL) { + errno = ERANGE; + } - if (endptr) *endptr = p; + if (skip_trailing) { + // Skip trailing whitespace + while (isspace(*p)) p++; + } + if (endptr) *endptr = p; - return number; + return number; } + +#endif // PANDAS_SRC_PARSE_HELPER_H_ diff --git a/pandas/src/period_helper.c b/pandas/src/period_helper.c index 6078be6fc3d19..19f810eb54ea7 100644 --- a/pandas/src/period_helper.c +++ b/pandas/src/period_helper.c @@ -1,30 +1,37 @@ -#include "period_helper.h" +/* +Copyright (c) 2016, PyData Development Team +All rights reserved. +Distributed under the terms of the BSD Simplified License. -/* - * Borrowed and derived code from scikits.timeseries that we will expose via - * Cython to pandas. This primarily concerns period representation and - * frequency conversion routines. - */ +The full license is in the LICENSE file, distributed with this software. -/* see end of file for stuff pandas uses (search for 'pandas') */ +Borrowed and derived code from scikits.timeseries that we will expose via +Cython to pandas. This primarily concerns interval representation and +frequency conversion routines. + +See end of file for stuff pandas uses (search for 'pandas'). 
+*/ + +#include "period_helper.h" /* ------------------------------------------------------------------ * Code derived from scikits.timeseries * ------------------------------------------------------------------*/ static int mod_compat(int x, int m) { - int result = x % m; - if (result < 0) return result + m; - return result; + int result = x % m; + if (result < 0) return result + m; + return result; } static int floordiv(int x, int divisor) { if (x < 0) { if (mod_compat(x, divisor)) { return x / divisor - 1; + } else { + return x / divisor; } - else return x / divisor; } else { return x / divisor; } @@ -32,19 +39,16 @@ static int floordiv(int x, int divisor) { /* Table with day offsets for each month (0-based, without and with leap) */ static int month_offset[2][13] = { - { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }, - { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 } -}; + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}}; /* Table of number of days in a month (0-based, without and with leap) */ static int days_in_month[2][12] = { - { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, - { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } -}; + {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, + {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; /* Return 1/0 iff year points to a leap year in calendar. */ -static int dInfoCalc_Leapyear(npy_int64 year, int calendar) -{ +static int dInfoCalc_Leapyear(npy_int64 year, int calendar) { if (calendar == GREGORIAN_CALENDAR) { return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); } else { @@ -53,8 +57,7 @@ static int dInfoCalc_Leapyear(npy_int64 year, int calendar) } /* Return the day of the week for the given absolute date. */ -static int dInfoCalc_DayOfWeek(npy_int64 absdate) -{ +static int dInfoCalc_DayOfWeek(npy_int64 absdate) { int day_of_week; if (absdate >= 1) { @@ -65,7 +68,7 @@ static int dInfoCalc_DayOfWeek(npy_int64 absdate) return day_of_week; } -static int monthToQuarter(int month) { return ((month-1)/3)+1; } +static int monthToQuarter(int month) { return ((month - 1) / 3) + 1; } /* Return the year offset, that is the absolute date of the day 31.12.(year-1) in the given calendar. @@ -75,23 +78,22 @@ static int monthToQuarter(int month) { return ((month-1)/3)+1; } using the Gregorian Epoch) value by two days because the Epoch (0001-01-01) in the Julian calendar lies 2 days before the Epoch in the Gregorian calendar. 
*/ -static int dInfoCalc_YearOffset(npy_int64 year, int calendar) -{ +static int dInfoCalc_YearOffset(npy_int64 year, int calendar) { year--; if (calendar == GREGORIAN_CALENDAR) { - if (year >= 0 || -1/4 == -1) - return year*365 + year/4 - year/100 + year/400; - else - return year*365 + (year-3)/4 - (year-99)/100 + (year-399)/400; - } - else if (calendar == JULIAN_CALENDAR) { - if (year >= 0 || -1/4 == -1) - return year*365 + year/4 - 2; - else - return year*365 + (year-3)/4 - 2; + if (year >= 0 || -1 / 4 == -1) + return year * 365 + year / 4 - year / 100 + year / 400; + else + return year * 365 + (year - 3) / 4 - (year - 99) / 100 + + (year - 399) / 400; + } else if (calendar == JULIAN_CALENDAR) { + if (year >= 0 || -1 / 4 == -1) + return year * 365 + year / 4 - 2; + else + return year * 365 + (year - 3) / 4 - 2; } Py_Error(PyExc_ValueError, "unknown calendar"); - onError: +onError: return INT_ERR_CODE; } @@ -99,39 +101,32 @@ static int dInfoCalc_YearOffset(npy_int64 year, int calendar) * to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to indicate the calendar * to be used. */ -static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, - int year, int month, int day, int hour, int minute, double second, - int calendar) -{ - +static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, + int month, int day, int hour, + int minute, double second, + int calendar) { /* Calculate the absolute date */ { int leap; - npy_int64 absdate; + npy_int64 absdate; int yearoffset; /* Range check */ - Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), - PyExc_ValueError, - "year out of range: %i", - year); + Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), + PyExc_ValueError, "year out of range: %i", year); /* Is it a leap year ? 
*/ leap = dInfoCalc_Leapyear(year, calendar); /* Negative month values indicate months relative to the years end */ if (month < 0) month += 13; - Py_AssertWithArg(month >= 1 && month <= 12, - PyExc_ValueError, - "month out of range (1-12): %i", - month); + Py_AssertWithArg(month >= 1 && month <= 12, PyExc_ValueError, + "month out of range (1-12): %i", month); /* Negative values indicate days relative to the months end */ if (day < 0) day += days_in_month[leap][month - 1] + 1; Py_AssertWithArg(day >= 1 && day <= days_in_month[leap][month - 1], - PyExc_ValueError, - "day out of range: %i", - day); + PyExc_ValueError, "day out of range: %i", day); yearoffset = dInfoCalc_YearOffset(year, calendar); if (yearoffset == INT_ERR_CODE) goto onError; @@ -142,7 +137,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, dinfo->year = year; dinfo->month = month; - dinfo->quarter = ((month-1)/3)+1; + dinfo->quarter = ((month - 1) / 3) + 1; dinfo->day = day; dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); @@ -153,23 +148,18 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, /* Calculate the absolute time */ { - Py_AssertWithArg(hour >= 0 && hour <= 23, - PyExc_ValueError, - "hour out of range (0-23): %i", - hour); - Py_AssertWithArg(minute >= 0 && minute <= 59, - PyExc_ValueError, - "minute out of range (0-59): %i", - minute); - Py_AssertWithArg(second >= (double)0.0 && + Py_AssertWithArg(hour >= 0 && hour <= 23, PyExc_ValueError, + "hour out of range (0-23): %i", hour); + Py_AssertWithArg(minute >= 0 && minute <= 59, PyExc_ValueError, + "minute out of range (0-59): %i", minute); + Py_AssertWithArg( + second >= (double)0.0 && (second < (double)60.0 || - (hour == 23 && minute == 59 && - second < (double)61.0)), - PyExc_ValueError, - "second out of range (0.0 - <60.0; <61.0 for 23:59): %f", - second); + (hour == 23 && minute == 59 && second < (double)61.0)), + PyExc_ValueError, + "second out of range (0.0 - <60.0; <61.0 for 23:59): %f", second); - dinfo->abstime = (double)(hour*3600 + minute*60) + second; + dinfo->abstime = (double)(hour * 3600 + minute * 60) + second; dinfo->hour = hour; dinfo->minute = minute; @@ -177,7 +167,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, } return 0; - onError: +onError: return INT_ERR_CODE; } @@ -186,13 +176,11 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, XXX This could also be done using some integer arithmetics rather than with this iterative approach... 
*/ -static -int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, - npy_int64 absdate, int calendar) -{ +static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, + npy_int64 absdate, int calendar) { register npy_int64 year; npy_int64 yearoffset; - int leap,dayoffset; + int leap, dayoffset; int *monthoffset; /* Approximate year */ @@ -220,7 +208,7 @@ int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, } dayoffset = absdate - yearoffset; - leap = dInfoCalc_Leapyear(year,calendar); + leap = dInfoCalc_Leapyear(year, calendar); /* Forward correction: non leap years only have 365 days */ if (dayoffset > 365 && !leap) { @@ -239,23 +227,21 @@ int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, register int month; for (month = 1; month < 13; month++) { - if (monthoffset[month] >= dayoffset) - break; + if (monthoffset[month] >= dayoffset) break; } dinfo->month = month; dinfo->quarter = monthToQuarter(month); - dinfo->day = dayoffset - month_offset[leap][month-1]; + dinfo->day = dayoffset - month_offset[leap][month - 1]; } - dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); dinfo->day_of_year = dayoffset; dinfo->absdate = absdate; return 0; - onError: +onError: return INT_ERR_CODE; } @@ -269,39 +255,25 @@ int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, // helpers for frequency conversion routines // static int daytime_conversion_factors[][2] = { - { FR_DAY, 1 }, - { FR_HR, 24 }, - { FR_MIN, 60 }, - { FR_SEC, 60 }, - { FR_MS, 1000 }, - { FR_US, 1000 }, - { FR_NS, 1000 }, - { 0, 0 } -}; + {FR_DAY, 1}, {FR_HR, 24}, {FR_MIN, 60}, {FR_SEC, 60}, + {FR_MS, 1000}, {FR_US, 1000}, {FR_NS, 1000}, {0, 0}}; -static npy_int64** daytime_conversion_factor_matrix = NULL; +static npy_int64 **daytime_conversion_factor_matrix = NULL; -PANDAS_INLINE int max_value(int a, int b) { - return a > b ? a : b; -} +PANDAS_INLINE int max_value(int a, int b) { return a > b ? a : b; } -PANDAS_INLINE int min_value(int a, int b) { - return a < b ? a : b; -} +PANDAS_INLINE int min_value(int a, int b) { return a < b ? 
a : b; } -PANDAS_INLINE int get_freq_group(int freq) { - return (freq/1000)*1000; -} +PANDAS_INLINE int get_freq_group(int freq) { return (freq / 1000) * 1000; } -PANDAS_INLINE int get_freq_group_index(int freq) { - return freq/1000; -} +PANDAS_INLINE int get_freq_group_index(int freq) { return freq / 1000; } static int calc_conversion_factors_matrix_size(void) { int matrix_size = 0; int index; - for (index=0;; index++) { - int period_value = get_freq_group_index(daytime_conversion_factors[index][0]); + for (index = 0;; index++) { + int period_value = + get_freq_group_index(daytime_conversion_factors[index][0]); if (period_value == 0) { break; } @@ -313,9 +285,11 @@ static int calc_conversion_factors_matrix_size(void) { static void alloc_conversion_factors_matrix(int matrix_size) { int row_index; int column_index; - daytime_conversion_factor_matrix = malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); + daytime_conversion_factor_matrix = + malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); for (row_index = 0; row_index < matrix_size; row_index++) { - daytime_conversion_factor_matrix[row_index] = malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); + daytime_conversion_factor_matrix[row_index] = + malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); for (column_index = 0; column_index < matrix_size; column_index++) { daytime_conversion_factor_matrix[row_index][column_index] = 0; } @@ -325,7 +299,7 @@ static void alloc_conversion_factors_matrix(int matrix_size) { static npy_int64 calculate_conversion_factor(int start_value, int end_value) { npy_int64 conversion_factor = 0; int index; - for (index=0;; index++) { + for (index = 0;; index++) { int freq_group = daytime_conversion_factors[index][0]; if (freq_group == 0) { @@ -348,11 +322,11 @@ static npy_int64 calculate_conversion_factor(int start_value, int end_value) { static void populate_conversion_factors_matrix(void) { int row_index_index; - int row_value, row_index; + int row_value, row_index; int column_index_index; - int column_value, column_index; + int column_value, column_index; - for (row_index_index = 0;; row_index_index++) { + for (row_index_index = 0;; row_index_index++) { row_value = daytime_conversion_factors[row_index_index][0]; if (row_value == 0) { break; @@ -365,7 +339,8 @@ static void populate_conversion_factors_matrix(void) { } column_index = get_freq_group_index(column_value); - daytime_conversion_factor_matrix[row_index][column_index] = calculate_conversion_factor(row_value, column_value); + daytime_conversion_factor_matrix[row_index][column_index] = + calculate_conversion_factor(row_value, column_value); } } } @@ -378,13 +353,14 @@ void initialize_daytime_conversion_factor_matrix() { } } -PANDAS_INLINE npy_int64 get_daytime_conversion_factor(int from_index, int to_index) -{ - return daytime_conversion_factor_matrix[min_value(from_index, to_index)][max_value(from_index, to_index)]; +PANDAS_INLINE npy_int64 get_daytime_conversion_factor(int from_index, + int to_index) { + return daytime_conversion_factor_matrix[min_value(from_index, to_index)] + [max_value(from_index, to_index)]; } -PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, asfreq_info *af_info, int atEnd) -{ +PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, + asfreq_info *af_info, int atEnd) { if (atEnd) { return (ordinal + 1) * af_info->intraday_conversion_factor - 1; } else { @@ -392,14 +368,19 @@ PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, asfreq_info *af_info } } 
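/*
 * [Editor's aside -- illustrative sketch, not part of the patch above.]
 * upsample_daytime() maps an ordinal at a coarse intra-day frequency onto the
 * first (start) or last (end) ordinal at a finer frequency, using the factor
 * looked up in the daytime conversion matrix.  A minimal standalone model of
 * that arithmetic, assuming the day->second factor of 86400 (24 * 60 * 60)
 * from the table above; the function names here are the editor's, not pandas
 * API:
 */
#include <stdint.h>

/* First second-ordinal covered by a given day-ordinal. */
static int64_t day_to_second_start(int64_t day_ordinal) {
    return day_ordinal * 86400;
}

/* Last second-ordinal covered by a given day-ordinal. */
static int64_t day_to_second_end(int64_t day_ordinal) {
    return (day_ordinal + 1) * 86400 - 1;
}
/* e.g. day 1 spans second ordinals 86400 .. 172799;
 * downsampling is the inverse integer division by the same factor. */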
-PANDAS_INLINE npy_int64 downsample_daytime(npy_int64 ordinal, asfreq_info *af_info, int atEnd) -{ +PANDAS_INLINE npy_int64 downsample_daytime(npy_int64 ordinal, + asfreq_info *af_info, int atEnd) { return ordinal / (af_info->intraday_conversion_factor); } -PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, char relation, asfreq_info *af_info, freq_conv_func first_func, freq_conv_func second_func) { - //printf("transform_via_day(%ld, %ld, %d)\n", ordinal, af_info->intraday_conversion_factor, af_info->intraday_conversion_upsample); - npy_int64 result; +PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, char relation, + asfreq_info *af_info, + freq_conv_func first_func, + freq_conv_func second_func) { + // printf("transform_via_day(%ld, %ld, %d)\n", ordinal, + // af_info->intraday_conversion_factor, + // af_info->intraday_conversion_upsample); + npy_int64 result; result = (*first_func)(ordinal, relation, af_info); result = (*second_func)(result, relation, af_info); @@ -413,7 +394,7 @@ static npy_int64 DtoB_weekday(npy_int64 absdate) { static npy_int64 DtoB_WeekendToMonday(npy_int64 absdate, int day_of_week) { if (day_of_week > 4) { - //change to Monday after weekend + // change to Monday after weekend absdate += (7 - day_of_week); } return DtoB_weekday(absdate); @@ -421,7 +402,7 @@ static npy_int64 DtoB_WeekendToMonday(npy_int64 absdate, int day_of_week) { static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { if (day_of_week > 4) { - //change to friday before weekend + // change to friday before weekend absdate -= (day_of_week - 4); } return DtoB_weekday(absdate); @@ -429,7 +410,8 @@ static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { static npy_int64 absdate_from_ymd(int y, int m, int d) { struct date_info tempDate; - if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, GREGORIAN_CALENDAR)) { + if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, + GREGORIAN_CALENDAR)) { return INT_ERR_CODE; } return tempDate.absdate; @@ -437,27 +419,33 @@ static npy_int64 absdate_from_ymd(int y, int m, int d) { //************ FROM DAILY *************** -static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, + asfreq_info *af_info) { struct date_info dinfo; ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (dinfo.month > af_info->to_a_year_end) { return (npy_int64)(dinfo.year + 1 - BASE_YEAR); - } - else { + } else { return (npy_int64)(dinfo.year - BASE_YEAR); } } -static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, int *quarter) { +static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, + int *quarter) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (af_info->to_q_year_end != 12) { dinfo.month -= af_info->to_q_year_end; - if (dinfo.month <= 0) { dinfo.month += 12; } - else { dinfo.year += 1; } + if (dinfo.month <= 0) { + dinfo.month += 12; + } else { + dinfo.year += 1; + } dinfo.quarter = monthToQuarter(dinfo.month); } @@ -467,7 +455,8 @@ static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, int return 0; } 
-static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, char relation, + asfreq_info *af_info) { int year, quarter; ordinal = downsample_daytime(ordinal, af_info, 0); @@ -479,27 +468,33 @@ static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, char relation, asfreq_info *af_ return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); } -static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, + asfreq_info *af_info) { struct date_info dinfo; ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); } -static npy_int64 asfreq_DTtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DTtoW(npy_int64 ordinal, char relation, + asfreq_info *af_info) { ordinal = downsample_daytime(ordinal, af_info, 0); - return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end))/7 + 1 - WEEK_OFFSET; + return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end)) / 7 + 1 - + WEEK_OFFSET; } -static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, + asfreq_info *af_info) { struct date_info dinfo; - ordinal = downsample_daytime(ordinal, af_info, 0); + ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; if (relation == 'S') { @@ -510,43 +505,54 @@ static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, asfreq_info *af_ } // all intra day calculations are now done within one function -static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, char relation, + asfreq_info *af_info) { return downsample_daytime(ordinal, af_info, relation == 'E'); } -static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, char relation, + asfreq_info *af_info) { return upsample_daytime(ordinal, af_info, relation == 'E'); } //************ FROM BUSINESS *************** -static npy_int64 asfreq_BtoDT(npy_int64 ordinal, char relation, asfreq_info *af_info) -{ +static npy_int64 asfreq_BtoDT(npy_int64 ordinal, char relation, + asfreq_info *af_info) { ordinal += BDAY_OFFSET; - ordinal = (((ordinal - 1) / 5) * 7 + - mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); + ordinal = + (((ordinal - 1) / 5) * 7 + mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); return upsample_daytime(ordinal, af_info, relation != 'S'); } -static npy_int64 asfreq_BtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, asfreq_DTtoA); +static npy_int64 asfreq_BtoA(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, + asfreq_DTtoA); } -static npy_int64 asfreq_BtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, 
asfreq_DTtoQ); +static npy_int64 asfreq_BtoQ(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, + asfreq_DTtoQ); } -static npy_int64 asfreq_BtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, asfreq_DTtoM); +static npy_int64 asfreq_BtoM(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, + asfreq_DTtoM); } -static npy_int64 asfreq_BtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, asfreq_DTtoW); +static npy_int64 asfreq_BtoW(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, + asfreq_DTtoW); } //************ FROM WEEKLY *************** -static npy_int64 asfreq_WtoDT(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_WtoDT(npy_int64 ordinal, char relation, + asfreq_info *af_info) { ordinal += WEEK_OFFSET; if (relation != 'S') { ordinal += 1; @@ -561,33 +567,41 @@ static npy_int64 asfreq_WtoDT(npy_int64 ordinal, char relation, asfreq_info *af_ return upsample_daytime(ordinal, af_info, relation != 'S'); } -static npy_int64 asfreq_WtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoA); +static npy_int64 asfreq_WtoA(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, + asfreq_DTtoA); } -static npy_int64 asfreq_WtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoQ); +static npy_int64 asfreq_WtoQ(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, + asfreq_DTtoQ); } -static npy_int64 asfreq_WtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoM); +static npy_int64 asfreq_WtoM(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, + asfreq_DTtoM); } -static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoW); +static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, + asfreq_DTtoW); } -static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { - +static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, + asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, - asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) + return INT_ERR_CODE; if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } - else { + } else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } } @@ -598,46 +612,58 @@ static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { *m = mod_compat(ordinal, 12) + 1; } - -static npy_int64 
asfreq_MtoDT(npy_int64 ordinal, char relation, asfreq_info* af_info) { +static npy_int64 asfreq_MtoDT(npy_int64 ordinal, char relation, + asfreq_info *af_info) { npy_int64 absdate; int y, m; if (relation == 'E') { - ordinal += 1; + ordinal += 1; } MtoD_ym(ordinal, &y, &m); - if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) + return INT_ERR_CODE; ordinal = absdate - ORD_OFFSET; if (relation == 'E') { - ordinal -= 1; + ordinal -= 1; } return upsample_daytime(ordinal, af_info, relation != 'S'); } -static npy_int64 asfreq_MtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, asfreq_DTtoA); +static npy_int64 asfreq_MtoA(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, + asfreq_DTtoA); } -static npy_int64 asfreq_MtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, asfreq_DTtoQ); +static npy_int64 asfreq_MtoQ(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, + asfreq_DTtoQ); } -static npy_int64 asfreq_MtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, asfreq_DTtoW); +static npy_int64 asfreq_MtoW(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, + asfreq_DTtoW); } -static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, + asfreq_info *af_info) { struct date_info dinfo; - - if (dInfoCalc_SetFromAbsDate(&dinfo, - asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) return INT_ERR_CODE; - if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } - else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } + if (dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + + if (relation == 'S') { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } else { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } } //************ FROM QUARTERLY *************** @@ -648,62 +674,78 @@ static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { if (af_info->from_q_year_end != 12) { *m += af_info->from_q_year_end; - if (*m > 12) { *m -= 12; } - else { *y -= 1; } + if (*m > 12) { + *m -= 12; + } else { + *y -= 1; + } } } -static npy_int64 asfreq_QtoDT(npy_int64 ordinal, char relation, asfreq_info *af_info) { - +static npy_int64 asfreq_QtoDT(npy_int64 ordinal, char relation, + asfreq_info *af_info) { npy_int64 absdate; int y, m; if (relation == 'E') { - ordinal += 1; + ordinal += 1; } QtoD_ym(ordinal, &y, &m, af_info); - if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) + return INT_ERR_CODE; if (relation == 'E') { - absdate -= 1; + absdate -= 1; } return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); } -static npy_int64 asfreq_QtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, 
asfreq_DTtoQ); +static npy_int64 asfreq_QtoQ(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, + asfreq_DTtoQ); } -static npy_int64 asfreq_QtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, asfreq_DTtoA); +static npy_int64 asfreq_QtoA(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, + asfreq_DTtoA); } -static npy_int64 asfreq_QtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, asfreq_DTtoM); +static npy_int64 asfreq_QtoM(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, + asfreq_DTtoM); } -static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, asfreq_DTtoW); +static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, + asfreq_DTtoW); } -static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { - +static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, + asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, - asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) + return INT_ERR_CODE; - if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } - else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } + if (relation == 'S') { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } else { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } } - //************ FROM ANNUAL *************** -static npy_int64 asfreq_AtoDT(npy_int64 year, char relation, asfreq_info *af_info) { +static npy_int64 asfreq_AtoDT(npy_int64 year, char relation, + asfreq_info *af_info) { npy_int64 absdate; int month = (af_info->from_a_year_end) % 12; @@ -713,164 +755,193 @@ static npy_int64 asfreq_AtoDT(npy_int64 year, char relation, asfreq_info *af_inf month += 1; if (af_info->from_a_year_end != 12) { - year -= 1; + year -= 1; } if (relation == 'E') { - year += 1; + year += 1; } absdate = absdate_from_ymd(year, month, 1); - if (absdate == INT_ERR_CODE) { + if (absdate == INT_ERR_CODE) { return INT_ERR_CODE; } if (relation == 'E') { - absdate -= 1; + absdate -= 1; } return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); } -static npy_int64 asfreq_AtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoA); +static npy_int64 asfreq_AtoA(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, + asfreq_DTtoA); } -static npy_int64 asfreq_AtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoQ); +static npy_int64 asfreq_AtoQ(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, + asfreq_DTtoQ); } 
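/*
 * [Editor's aside -- illustrative sketch, not part of the patch above.]
 * The asfreq_XtoY wrappers in this file all route through
 * transform_via_day(): the source frequency is first converted to a daily
 * ordinal, which is then converted to the target frequency.  A stripped-down
 * model of that composition; the typedef and names are the editor's:
 */
typedef long long (*conv_fn)(long long ordinal, char relation);

static long long via_day(long long ordinal, char relation,
                         conv_fn to_day, conv_fn from_day) {
    long long daily = to_day(ordinal, relation);   /* source -> daily  */
    return from_day(daily, relation);              /* daily  -> target */
}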
-static npy_int64 asfreq_AtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoM); +static npy_int64 asfreq_AtoM(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, + asfreq_DTtoM); } -static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoW); +static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, + asfreq_DTtoW); } -static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { - +static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, + asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, - asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) + return INT_ERR_CODE; - if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } - else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } + if (relation == 'S') { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } else { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } } -static npy_int64 nofunc(npy_int64 ordinal, char relation, asfreq_info *af_info) { return INT_ERR_CODE; } -static npy_int64 no_op(npy_int64 ordinal, char relation, asfreq_info *af_info) { return ordinal; } +static npy_int64 nofunc(npy_int64 ordinal, char relation, + asfreq_info *af_info) { + return INT_ERR_CODE; +} +static npy_int64 no_op(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return ordinal; +} // end of frequency specific conversion routines static int calc_a_year_end(int freq, int group) { int result = (freq - group) % 12; - if (result == 0) {return 12;} - else {return result;} + if (result == 0) { + return 12; + } else { + return result; + } } -static int calc_week_end(int freq, int group) { - return freq - group; -} +static int calc_week_end(int freq, int group) { return freq - group; } void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) { int fromGroup = get_freq_group(fromFreq); int toGroup = get_freq_group(toFreq); - af_info->intraday_conversion_factor = - get_daytime_conversion_factor( - get_freq_group_index(max_value(fromGroup, FR_DAY)), - get_freq_group_index(max_value(toGroup, FR_DAY)) - ); + af_info->intraday_conversion_factor = get_daytime_conversion_factor( + get_freq_group_index(max_value(fromGroup, FR_DAY)), + get_freq_group_index(max_value(toGroup, FR_DAY))); - //printf("get_asfreq_info(%d, %d) %ld, %d\n", fromFreq, toFreq, af_info->intraday_conversion_factor, af_info->intraday_conversion_upsample); + // printf("get_asfreq_info(%d, %d) %ld, %d\n", fromFreq, toFreq, + // af_info->intraday_conversion_factor, + // af_info->intraday_conversion_upsample); - switch(fromGroup) - { - case FR_WK: + switch (fromGroup) { + case FR_WK: af_info->from_week_end = calc_week_end(fromFreq, fromGroup); break; - case FR_ANN: + case FR_ANN: af_info->from_a_year_end = calc_a_year_end(fromFreq, fromGroup); break; - case FR_QTR: + case FR_QTR: af_info->from_q_year_end = calc_a_year_end(fromFreq, fromGroup); break; } - switch(toGroup) - { - case FR_WK: + switch 
(toGroup) { + case FR_WK: af_info->to_week_end = calc_week_end(toFreq, toGroup); break; - case FR_ANN: + case FR_ANN: af_info->to_a_year_end = calc_a_year_end(toFreq, toGroup); break; - case FR_QTR: + case FR_QTR: af_info->to_q_year_end = calc_a_year_end(toFreq, toGroup); break; } } - -freq_conv_func get_asfreq_func(int fromFreq, int toFreq) -{ +freq_conv_func get_asfreq_func(int fromFreq, int toFreq) { int fromGroup = get_freq_group(fromFreq); int toGroup = get_freq_group(toFreq); - if (fromGroup == FR_UND) { fromGroup = FR_DAY; } + if (fromGroup == FR_UND) { + fromGroup = FR_DAY; + } - switch(fromGroup) - { + switch (fromGroup) { case FR_ANN: - switch(toGroup) - { - case FR_ANN: return &asfreq_AtoA; - case FR_QTR: return &asfreq_AtoQ; - case FR_MTH: return &asfreq_AtoM; - case FR_WK: return &asfreq_AtoW; - case FR_BUS: return &asfreq_AtoB; - case FR_DAY: - case FR_HR: - case FR_MIN: + switch (toGroup) { + case FR_ANN: + return &asfreq_AtoA; + case FR_QTR: + return &asfreq_AtoQ; + case FR_MTH: + return &asfreq_AtoM; + case FR_WK: + return &asfreq_AtoW; + case FR_BUS: + return &asfreq_AtoB; + case FR_DAY: + case FR_HR: + case FR_MIN: case FR_SEC: case FR_MS: case FR_US: case FR_NS: - return &asfreq_AtoDT; + return &asfreq_AtoDT; - default: return &nofunc; + default: + return &nofunc; } case FR_QTR: - switch(toGroup) - { - case FR_ANN: return &asfreq_QtoA; - case FR_QTR: return &asfreq_QtoQ; - case FR_MTH: return &asfreq_QtoM; - case FR_WK: return &asfreq_QtoW; - case FR_BUS: return &asfreq_QtoB; - case FR_DAY: + switch (toGroup) { + case FR_ANN: + return &asfreq_QtoA; + case FR_QTR: + return &asfreq_QtoQ; + case FR_MTH: + return &asfreq_QtoM; + case FR_WK: + return &asfreq_QtoW; + case FR_BUS: + return &asfreq_QtoB; + case FR_DAY: case FR_HR: case FR_MIN: case FR_SEC: case FR_MS: case FR_US: case FR_NS: - return &asfreq_QtoDT; - default: return &nofunc; + return &asfreq_QtoDT; + default: + return &nofunc; } case FR_MTH: - switch(toGroup) - { - case FR_ANN: return &asfreq_MtoA; - case FR_QTR: return &asfreq_MtoQ; - case FR_MTH: return &no_op; - case FR_WK: return &asfreq_MtoW; - case FR_BUS: return &asfreq_MtoB; + switch (toGroup) { + case FR_ANN: + return &asfreq_MtoA; + case FR_QTR: + return &asfreq_MtoQ; + case FR_MTH: + return &no_op; + case FR_WK: + return &asfreq_MtoW; + case FR_BUS: + return &asfreq_MtoB; case FR_DAY: case FR_HR: case FR_MIN: @@ -878,46 +949,57 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) case FR_MS: case FR_US: case FR_NS: - return &asfreq_MtoDT; - default: return &nofunc; + return &asfreq_MtoDT; + default: + return &nofunc; } case FR_WK: - switch(toGroup) - { - case FR_ANN: return &asfreq_WtoA; - case FR_QTR: return &asfreq_WtoQ; - case FR_MTH: return &asfreq_WtoM; - case FR_WK: return &asfreq_WtoW; - case FR_BUS: return &asfreq_WtoB; - case FR_DAY: - case FR_HR: - case FR_MIN: - case FR_SEC: + switch (toGroup) { + case FR_ANN: + return &asfreq_WtoA; + case FR_QTR: + return &asfreq_WtoQ; + case FR_MTH: + return &asfreq_WtoM; + case FR_WK: + return &asfreq_WtoW; + case FR_BUS: + return &asfreq_WtoB; + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: case FR_MS: case FR_US: case FR_NS: - return &asfreq_WtoDT; - default: return &nofunc; + return &asfreq_WtoDT; + default: + return &nofunc; } case FR_BUS: - switch(toGroup) - { - case FR_ANN: return &asfreq_BtoA; - case FR_QTR: return &asfreq_BtoQ; - case FR_MTH: return &asfreq_BtoM; - case FR_WK: return &asfreq_BtoW; - case FR_BUS: return &no_op; - case FR_DAY: - case FR_HR: - case FR_MIN: + 
switch (toGroup) { + case FR_ANN: + return &asfreq_BtoA; + case FR_QTR: + return &asfreq_BtoQ; + case FR_MTH: + return &asfreq_BtoM; + case FR_WK: + return &asfreq_BtoW; + case FR_BUS: + return &no_op; + case FR_DAY: + case FR_HR: + case FR_MIN: case FR_SEC: case FR_MS: case FR_US: case FR_NS: - return &asfreq_BtoDT; - default: return &nofunc; + return &asfreq_BtoDT; + default: + return &nofunc; } case FR_DAY: @@ -927,14 +1009,18 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) case FR_MS: case FR_US: case FR_NS: - switch(toGroup) - { - case FR_ANN: return &asfreq_DTtoA; - case FR_QTR: return &asfreq_DTtoQ; - case FR_MTH: return &asfreq_DTtoM; - case FR_WK: return &asfreq_DTtoW; - case FR_BUS: return &asfreq_DTtoB; - case FR_DAY: + switch (toGroup) { + case FR_ANN: + return &asfreq_DTtoA; + case FR_QTR: + return &asfreq_DTtoQ; + case FR_MTH: + return &asfreq_DTtoM; + case FR_WK: + return &asfreq_DTtoW; + case FR_BUS: + return &asfreq_DTtoB; + case FR_DAY: case FR_HR: case FR_MIN: case FR_SEC: @@ -946,59 +1032,60 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) } else { return &asfreq_UpsampleWithinDay; } - default: return &nofunc; + default: + return &nofunc; } - default: return &nofunc; + default: + return &nofunc; } } double get_abs_time(int freq, npy_int64 date_ordinal, npy_int64 ordinal) { - //printf("get_abs_time %d %lld %lld\n", freq, date_ordinal, ordinal); + // printf("get_abs_time %d %lld %lld\n", freq, date_ordinal, ordinal); - int freq_index, day_index, base_index; - npy_int64 per_day, start_ord; - double unit, result; + int freq_index, day_index, base_index; + npy_int64 per_day, start_ord; + double unit, result; if (freq <= FR_DAY) { - return 0; + return 0; } freq_index = get_freq_group_index(freq); day_index = get_freq_group_index(FR_DAY); base_index = get_freq_group_index(FR_SEC); - //printf(" indices: day %d, freq %d, base %d\n", day_index, freq_index, base_index); + // printf(" indices: day %d, freq %d, base %d\n", day_index, freq_index, + // base_index); per_day = get_daytime_conversion_factor(day_index, freq_index); unit = get_daytime_conversion_factor(freq_index, base_index); - //printf(" per_day: %lld, unit: %f\n", per_day, unit); + // printf(" per_day: %lld, unit: %f\n", per_day, unit); if (base_index < freq_index) { - unit = 1 / unit; - //printf(" corrected unit: %f\n", unit); + unit = 1 / unit; + // printf(" corrected unit: %f\n", unit); } start_ord = date_ordinal * per_day; - //printf("start_ord: %lld\n", start_ord); - result = (double) ( unit * (ordinal - start_ord)); - //printf(" result: %f\n", result); + // printf("start_ord: %lld\n", start_ord); + result = (double)(unit * (ordinal - start_ord)); + // printf(" result: %f\n", result); return result; } /* Sets the time part of the DateTime object. */ -static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, - double abstime) -{ +static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, double abstime) { int inttime; - int hour,minute; + int hour, minute; double second; inttime = (int)abstime; hour = inttime / 3600; minute = (inttime % 3600) / 60; - second = abstime - (double)(hour*3600 + minute*60); + second = abstime - (double)(hour * 3600 + minute * 60); dinfo->hour = hour; dinfo->minute = minute; @@ -1013,15 +1100,12 @@ static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, may be set to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to indicate the calendar to be used. 
*/ static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, - npy_int64 absdate, - double abstime, - int calendar) -{ + npy_int64 absdate, double abstime, + int calendar) { /* Bounds check */ Py_AssertWithArg(abstime >= 0.0 && abstime <= SECONDS_PER_DAY, - PyExc_ValueError, - "abstime out of range (0.0 - 86400.0): %f", - abstime); + PyExc_ValueError, + "abstime out of range (0.0 - 86400.0): %f", abstime); /* Calculate the date */ if (dInfoCalc_SetFromAbsDate(dinfo, absdate, calendar)) goto onError; @@ -1038,8 +1122,8 @@ static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, * New pandas API-helper code, to expose to cython * ------------------------------------------------------------------*/ -npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation) -{ +npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, + char relation) { npy_int64 val; freq_conv_func func; asfreq_info finfo; @@ -1048,12 +1132,14 @@ npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation) get_asfreq_info(freq1, freq2, &finfo); - //printf("\n%x %d %d %ld %ld\n", func, freq1, freq2, finfo.intraday_conversion_factor, -finfo.intraday_conversion_factor); + // printf("\n%x %d %d %ld %ld\n", func, freq1, freq2, + // finfo.intraday_conversion_factor, -finfo.intraday_conversion_factor); val = (*func)(period_ordinal, relation, &finfo); if (val == INT_ERR_CODE) { - //Py_Error(PyExc_ValueError, "Unable to convert to desired frequency."); + // Py_Error(PyExc_ValueError, "Unable to convert to desired + // frequency."); goto onError; } return val; @@ -1061,12 +1147,10 @@ npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation) return INT_ERR_CODE; } - /* generate an ordinal in period space */ -npy_int64 get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, int microseconds, int picoseconds, - int freq) -{ +npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, + int second, int microseconds, int picoseconds, + int freq) { npy_int64 absdays, delta, seconds; npy_int64 weeks, days; npy_int64 ordinal, day_adj; @@ -1074,20 +1158,21 @@ npy_int64 get_period_ordinal(int year, int month, int day, freq_group = get_freq_group(freq); if (freq == FR_SEC || freq == FR_MS || freq == FR_US || freq == FR_NS) { - absdays = absdate_from_ymd(year, month, day); delta = (absdays - ORD_OFFSET); - seconds = (npy_int64)(delta * 86400 + hour * 3600 + minute * 60 + second); + seconds = + (npy_int64)(delta * 86400 + hour * 3600 + minute * 60 + second); - switch(freq) { - case FR_MS: - return seconds * 1000 + microseconds / 1000; + switch (freq) { + case FR_MS: + return seconds * 1000 + microseconds / 1000; - case FR_US: - return seconds * 1000000 + microseconds; + case FR_US: + return seconds * 1000000 + microseconds; - case FR_NS: - return seconds * 1000000000 + microseconds * 1000 + picoseconds / 1000; + case FR_NS: + return seconds * 1000000000 + microseconds * 1000 + + picoseconds / 1000; } return seconds; @@ -1096,63 +1181,55 @@ npy_int64 get_period_ordinal(int year, int month, int day, if (freq == FR_MIN) { absdays = absdate_from_ymd(year, month, day); delta = (absdays - ORD_OFFSET); - return (npy_int64)(delta*1440 + hour*60 + minute); + return (npy_int64)(delta * 1440 + hour * 60 + minute); } if (freq == FR_HR) { - if ((absdays = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) - { + if ((absdays = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) { goto onError; } delta = (absdays - ORD_OFFSET); 
- return (npy_int64)(delta*24 + hour); + return (npy_int64)(delta * 24 + hour); } - if (freq == FR_DAY) - { - return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); + if (freq == FR_DAY) { + return (npy_int64)(absdate_from_ymd(year, month, day) - ORD_OFFSET); } - if (freq == FR_UND) - { - return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); + if (freq == FR_UND) { + return (npy_int64)(absdate_from_ymd(year, month, day) - ORD_OFFSET); } - if (freq == FR_BUS) - { - if((days = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) - { + if (freq == FR_BUS) { + if ((days = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) { goto onError; } // calculate the current week assuming sunday as last day of a week weeks = (days - BASE_WEEK_TO_DAY_OFFSET) / DAYS_PER_WEEK; // calculate the current weekday (in range 1 .. 7) delta = (days - BASE_WEEK_TO_DAY_OFFSET) % DAYS_PER_WEEK + 1; - // return the number of business days in full weeks plus the business days in the last - possible partial - week - return (npy_int64)(weeks * BUSINESS_DAYS_PER_WEEK) - + (delta <= BUSINESS_DAYS_PER_WEEK - ? delta - : BUSINESS_DAYS_PER_WEEK + 1) - - BDAY_OFFSET; + // return the number of business days in full weeks plus the business + // days in the last - possible partial - week + return (npy_int64)(weeks * BUSINESS_DAYS_PER_WEEK) + + (delta <= BUSINESS_DAYS_PER_WEEK ? delta + : BUSINESS_DAYS_PER_WEEK + 1) - + BDAY_OFFSET; } - if (freq_group == FR_WK) - { - if((ordinal = (npy_int64)absdate_from_ymd(year, month, day)) == INT_ERR_CODE) - { + if (freq_group == FR_WK) { + if ((ordinal = (npy_int64)absdate_from_ymd(year, month, day)) == + INT_ERR_CODE) { goto onError; } day_adj = freq - FR_WK; return (ordinal - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET; } - if (freq == FR_MTH) - { + if (freq == FR_MTH) { return (year - BASE_YEAR) * 12 + month - 1; } - if (freq_group == FR_QTR) - { + if (freq_group == FR_QTR) { fmonth = freq - FR_QTR; if (fmonth == 0) fmonth = 12; @@ -1163,14 +1240,12 @@ npy_int64 get_period_ordinal(int year, int month, int day, return (year - BASE_YEAR) * 4 + (mdiff - 1) / 3; } - if (freq_group == FR_ANN) - { + if (freq_group == FR_ANN) { fmonth = freq - FR_ANN; if (fmonth == 0) fmonth = 12; if (month <= fmonth) { return year - BASE_YEAR; - } - else { + } else { return year - BASE_YEAR + 1; } } @@ -1188,13 +1263,11 @@ npy_int64 get_period_ordinal(int year, int month, int day, is calculated for the last day of the period. 
*/ -npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) -{ +npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) { asfreq_info af_info; - freq_conv_func toDaily = NULL; + freq_conv_func toDaily = NULL; - if (freq == FR_DAY) - return period_ordinal + ORD_OFFSET; + if (freq == FR_DAY) return period_ordinal + ORD_OFFSET; toDaily = get_asfreq_func(freq, FR_DAY); get_asfreq_info(freq, FR_DAY, &af_info); @@ -1216,12 +1289,14 @@ char *str_replace(const char *s, const char *old, const char *new) { } ret = PyArray_malloc(i + 1 + count * (newlen - oldlen)); - if (ret == NULL) {return (char *)PyErr_NoMemory();} + if (ret == NULL) { + return (char *)PyErr_NoMemory(); + } i = 0; while (*s) { if (strstr(s, old) == s) { - strcpy(&ret[i], new); + strncpy(&ret[i], new, sizeof(char) * newlen); i += newlen; s += oldlen; } else { @@ -1236,9 +1311,9 @@ char *str_replace(const char *s, const char *old, const char *new) { // function to generate a nice string representation of the period // object, originally from DateObject_strftime -char* c_strftime(struct date_info *tmp, char *fmt) { +char *c_strftime(struct date_info *tmp, char *fmt) { struct tm c_date; - char* result; + char *result; struct date_info dinfo = *tmp; int result_len = strlen(fmt) + 50; @@ -1263,7 +1338,7 @@ int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { asfreq_info af_info; int qtr_freq; npy_int64 daily_ord; - npy_int64 (*toDaily)(npy_int64, char, asfreq_info*) = NULL; + npy_int64 (*toDaily)(npy_int64, char, asfreq_info *) = NULL; toDaily = get_asfreq_func(freq, FR_DAY); get_asfreq_info(freq, FR_DAY, &af_info); @@ -1272,19 +1347,16 @@ int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { if (get_freq_group(freq) == FR_QTR) { qtr_freq = freq; - } else { qtr_freq = FR_QTR; } + } else { + qtr_freq = FR_QTR; + } get_asfreq_info(FR_DAY, qtr_freq, &af_info); - if(DtoQ_yq(daily_ord, &af_info, year, quarter) == INT_ERR_CODE) - return -1; + if (DtoQ_yq(daily_ord, &af_info, year, quarter) == INT_ERR_CODE) return -1; return 0; } - - - - static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { asfreq_info af_info; int qtr_freq; @@ -1301,31 +1373,29 @@ static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { if (DtoQ_yq(ordinal, &af_info, year, quarter) == INT_ERR_CODE) return INT_ERR_CODE; - if ((qtr_freq % 1000) > 12) - *year -= 1; + if ((qtr_freq % 1000) > 12) *year -= 1; return 0; } -static int _ISOWeek(struct date_info *dinfo) -{ +static int _ISOWeek(struct date_info *dinfo) { int week; /* Estimate */ - week = (dinfo->day_of_year-1) - dinfo->day_of_week + 3; + week = (dinfo->day_of_year - 1) - dinfo->day_of_week + 3; if (week >= 0) week = week / 7 + 1; /* Verify */ if (week < 0) { /* The day lies in last week of the previous year */ - if ((week > -2) || - (week == -2 && dInfoCalc_Leapyear(dinfo->year-1, dinfo->calendar))) + if ((week > -2) || (week == -2 && dInfoCalc_Leapyear(dinfo->year - 1, + dinfo->calendar))) week = 53; else week = 52; } else if (week == 53) { /* Check if the week belongs to year or year+1 */ - if (31-dinfo->day + dinfo->day_of_week < 3) { + if (31 - dinfo->day + dinfo->day_of_week < 3) { week = 1; } } @@ -1333,8 +1403,7 @@ static int _ISOWeek(struct date_info *dinfo) return week; } -int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) -{ +int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { npy_int64 absdate = get_python_ordinal(ordinal, freq); double abstime = get_abs_time(freq, absdate 
- ORD_OFFSET, ordinal); @@ -1344,11 +1413,11 @@ int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) } while (abstime >= 86400) { abstime -= 86400; - absdate += 1; + absdate += 1; } - if(dInfoCalc_SetFromAbsDateTime(dinfo, absdate, - abstime, GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; return 0; @@ -1362,77 +1431,77 @@ int pyear(npy_int64 ordinal, int freq) { int pqyear(npy_int64 ordinal, int freq) { int year, quarter; - if( _quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) + if (_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) return INT_ERR_CODE; return year; } int pquarter(npy_int64 ordinal, int freq) { int year, quarter; - if(_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) + if (_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) return INT_ERR_CODE; return quarter; } int pmonth(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.month; } int pday(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day; } int pweekday(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day_of_week; } int pday_of_week(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day_of_week; } int pday_of_year(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.day_of_year; } int pweek(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return _ISOWeek(&dinfo); } int phour(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.hour; } int pminute(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return dinfo.minute; } int psecond(npy_int64 ordinal, int freq) { struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; return (int)dinfo.second; } @@ -1440,9 +1509,10 @@ int psecond(npy_int64 ordinal, int freq) { int pdays_in_month(npy_int64 ordinal, int freq) { int days; struct date_info dinfo; - if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; - - days = days_in_month[dInfoCalc_Leapyear(dinfo.year, dinfo.calendar)][dinfo.month-1]; + + days = days_in_month[dInfoCalc_Leapyear(dinfo.year, dinfo.calendar)] + [dinfo.month - 1]; return days; } diff --git a/pandas/src/period_helper.h 
b/pandas/src/period_helper.h index 0351321926fa2..601717692ff6d 100644 --- a/pandas/src/period_helper.h +++ b/pandas/src/period_helper.h @@ -1,17 +1,24 @@ /* - * Borrowed and derived code from scikits.timeseries that we will expose via - * Cython to pandas. This primarily concerns interval representation and - * frequency conversion routines. - */ +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. -#ifndef C_PERIOD_H -#define C_PERIOD_H +Borrowed and derived code from scikits.timeseries that we will expose via +Cython to pandas. This primarily concerns interval representation and +frequency conversion routines. +*/ + +#ifndef PANDAS_SRC_PERIOD_HELPER_H_ +#define PANDAS_SRC_PERIOD_HELPER_H_ #include -#include "helper.h" -#include "numpy/ndarraytypes.h" #include "headers/stdint.h" +#include "helper.h" #include "limits.h" +#include "numpy/ndarraytypes.h" /* * declarations from period here @@ -20,100 +27,113 @@ #define GREGORIAN_CALENDAR 0 #define JULIAN_CALENDAR 1 -#define SECONDS_PER_DAY ((double) 86400.0) - -#define Py_AssertWithArg(x,errortype,errorstr,a1) {if (!(x)) {PyErr_Format(errortype,errorstr,a1);goto onError;}} -#define Py_Error(errortype,errorstr) {PyErr_SetString(errortype,errorstr);goto onError;} +#define SECONDS_PER_DAY ((double)86400.0) + +#define Py_AssertWithArg(x, errortype, errorstr, a1) \ + { \ + if (!(x)) { \ + PyErr_Format(errortype, errorstr, a1); \ + goto onError; \ + } \ + } +#define Py_Error(errortype, errorstr) \ + { \ + PyErr_SetString(errortype, errorstr); \ + goto onError; \ + } /*** FREQUENCY CONSTANTS ***/ // HIGHFREQ_ORIG is the datetime ordinal from which to begin the second // frequency ordinal sequence -// typedef int64_t npy_int64; -// begins second ordinal at 1/1/1970 unix epoch - // #define HIGHFREQ_ORIG 62135683200LL #define BASE_YEAR 1970 -#define ORD_OFFSET 719163LL // days until 1970-01-01 -#define BDAY_OFFSET 513689LL // days until 1970-01-01 +#define ORD_OFFSET 719163LL // days until 1970-01-01 +#define BDAY_OFFSET 513689LL // days until 1970-01-01 #define WEEK_OFFSET 102737LL -#define BASE_WEEK_TO_DAY_OFFSET 1 // difference between day 0 and end of week in days +#define BASE_WEEK_TO_DAY_OFFSET \ + 1 // difference between day 0 and end of week in days #define DAYS_PER_WEEK 7 #define BUSINESS_DAYS_PER_WEEK 5 -#define HIGHFREQ_ORIG 0 // ORD_OFFSET * 86400LL // days until 1970-01-01 - -#define FR_ANN 1000 /* Annual */ -#define FR_ANNDEC FR_ANN /* Annual - December year end*/ -#define FR_ANNJAN 1001 /* Annual - January year end*/ -#define FR_ANNFEB 1002 /* Annual - February year end*/ -#define FR_ANNMAR 1003 /* Annual - March year end*/ -#define FR_ANNAPR 1004 /* Annual - April year end*/ -#define FR_ANNMAY 1005 /* Annual - May year end*/ -#define FR_ANNJUN 1006 /* Annual - June year end*/ -#define FR_ANNJUL 1007 /* Annual - July year end*/ -#define FR_ANNAUG 1008 /* Annual - August year end*/ -#define FR_ANNSEP 1009 /* Annual - September year end*/ -#define FR_ANNOCT 1010 /* Annual - October year end*/ -#define FR_ANNNOV 1011 /* Annual - November year end*/ +#define HIGHFREQ_ORIG 0 // ORD_OFFSET * 86400LL // days until 1970-01-01 + +#define FR_ANN 1000 /* Annual */ +#define FR_ANNDEC FR_ANN /* Annual - December year end*/ +#define FR_ANNJAN 1001 /* Annual - January year end*/ +#define FR_ANNFEB 1002 /* Annual - February year end*/ +#define FR_ANNMAR 1003 /* Annual - March year end*/ +#define 
FR_ANNAPR 1004 /* Annual - April year end*/ +#define FR_ANNMAY 1005 /* Annual - May year end*/ +#define FR_ANNJUN 1006 /* Annual - June year end*/ +#define FR_ANNJUL 1007 /* Annual - July year end*/ +#define FR_ANNAUG 1008 /* Annual - August year end*/ +#define FR_ANNSEP 1009 /* Annual - September year end*/ +#define FR_ANNOCT 1010 /* Annual - October year end*/ +#define FR_ANNNOV 1011 /* Annual - November year end*/ /* The standard quarterly frequencies with various fiscal year ends eg, Q42005 for Q@OCT runs Aug 1, 2005 to Oct 31, 2005 */ -#define FR_QTR 2000 /* Quarterly - December year end (default quarterly) */ -#define FR_QTRDEC FR_QTR /* Quarterly - December year end */ -#define FR_QTRJAN 2001 /* Quarterly - January year end */ -#define FR_QTRFEB 2002 /* Quarterly - February year end */ -#define FR_QTRMAR 2003 /* Quarterly - March year end */ -#define FR_QTRAPR 2004 /* Quarterly - April year end */ -#define FR_QTRMAY 2005 /* Quarterly - May year end */ -#define FR_QTRJUN 2006 /* Quarterly - June year end */ -#define FR_QTRJUL 2007 /* Quarterly - July year end */ -#define FR_QTRAUG 2008 /* Quarterly - August year end */ -#define FR_QTRSEP 2009 /* Quarterly - September year end */ -#define FR_QTROCT 2010 /* Quarterly - October year end */ -#define FR_QTRNOV 2011 /* Quarterly - November year end */ - -#define FR_MTH 3000 /* Monthly */ - -#define FR_WK 4000 /* Weekly */ +#define FR_QTR 2000 /* Quarterly - December year end (default quarterly) */ +#define FR_QTRDEC FR_QTR /* Quarterly - December year end */ +#define FR_QTRJAN 2001 /* Quarterly - January year end */ +#define FR_QTRFEB 2002 /* Quarterly - February year end */ +#define FR_QTRMAR 2003 /* Quarterly - March year end */ +#define FR_QTRAPR 2004 /* Quarterly - April year end */ +#define FR_QTRMAY 2005 /* Quarterly - May year end */ +#define FR_QTRJUN 2006 /* Quarterly - June year end */ +#define FR_QTRJUL 2007 /* Quarterly - July year end */ +#define FR_QTRAUG 2008 /* Quarterly - August year end */ +#define FR_QTRSEP 2009 /* Quarterly - September year end */ +#define FR_QTROCT 2010 /* Quarterly - October year end */ +#define FR_QTRNOV 2011 /* Quarterly - November year end */ + +#define FR_MTH 3000 /* Monthly */ + +#define FR_WK 4000 /* Weekly */ #define FR_WKSUN FR_WK /* Weekly - Sunday end of week */ -#define FR_WKMON 4001 /* Weekly - Monday end of week */ -#define FR_WKTUE 4002 /* Weekly - Tuesday end of week */ -#define FR_WKWED 4003 /* Weekly - Wednesday end of week */ -#define FR_WKTHU 4004 /* Weekly - Thursday end of week */ -#define FR_WKFRI 4005 /* Weekly - Friday end of week */ -#define FR_WKSAT 4006 /* Weekly - Saturday end of week */ - -#define FR_BUS 5000 /* Business days */ -#define FR_DAY 6000 /* Daily */ -#define FR_HR 7000 /* Hourly */ -#define FR_MIN 8000 /* Minutely */ -#define FR_SEC 9000 /* Secondly */ -#define FR_MS 10000 /* Millisecondly */ -#define FR_US 11000 /* Microsecondly */ -#define FR_NS 12000 /* Nanosecondly */ - -#define FR_UND -10000 /* Undefined */ +#define FR_WKMON 4001 /* Weekly - Monday end of week */ +#define FR_WKTUE 4002 /* Weekly - Tuesday end of week */ +#define FR_WKWED 4003 /* Weekly - Wednesday end of week */ +#define FR_WKTHU 4004 /* Weekly - Thursday end of week */ +#define FR_WKFRI 4005 /* Weekly - Friday end of week */ +#define FR_WKSAT 4006 /* Weekly - Saturday end of week */ + +#define FR_BUS 5000 /* Business days */ +#define FR_DAY 6000 /* Daily */ +#define FR_HR 7000 /* Hourly */ +#define FR_MIN 8000 /* Minutely */ +#define FR_SEC 9000 /* Secondly */ +#define FR_MS 10000 /* 
Millisecondly */ +#define FR_US 11000 /* Microsecondly */ +#define FR_NS 12000 /* Nanosecondly */ + +#define FR_UND -10000 /* Undefined */ #define INT_ERR_CODE INT32_MIN -#define MEM_CHECK(item) if (item == NULL) { return PyErr_NoMemory(); } -#define ERR_CHECK(item) if (item == NULL) { return NULL; } +#define MEM_CHECK(item) \ + if (item == NULL) { \ + return PyErr_NoMemory(); \ + } +#define ERR_CHECK(item) \ + if (item == NULL) { \ + return NULL; \ + } typedef struct asfreq_info { - int from_week_end; // day the week ends on in the "from" frequency - int to_week_end; // day the week ends on in the "to" frequency + int from_week_end; // day the week ends on in the "from" frequency + int to_week_end; // day the week ends on in the "to" frequency - int from_a_year_end; // month the year ends on in the "from" frequency - int to_a_year_end; // month the year ends on in the "to" frequency + int from_a_year_end; // month the year ends on in the "from" frequency + int to_a_year_end; // month the year ends on in the "to" frequency - int from_q_year_end; // month the year ends on in the "from" frequency - int to_q_year_end; // month the year ends on in the "to" frequency + int from_q_year_end; // month the year ends on in the "from" frequency + int to_q_year_end; // month the year ends on in the "to" frequency npy_int64 intraday_conversion_factor; } asfreq_info; - typedef struct date_info { npy_int64 absdate; double abstime; @@ -130,7 +150,7 @@ typedef struct date_info { int calendar; } date_info; -typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info*); +typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info *); /* * new pandas API helper functions here @@ -138,9 +158,9 @@ typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info*); npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation); -npy_int64 get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, int microseconds, int picoseconds, - int freq); +npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, + int second, int microseconds, int picoseconds, + int freq); npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq); @@ -167,4 +187,5 @@ char *c_strftime(struct date_info *dinfo, char *fmt); int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); void initialize_daytime_conversion_factor_matrix(void); -#endif + +#endif // PANDAS_SRC_PERIOD_HELPER_H_ diff --git a/pandas/src/skiplist.h b/pandas/src/skiplist.h index 3bf63aedce9cb..013516a49fa2f 100644 --- a/pandas/src/skiplist.h +++ b/pandas/src/skiplist.h @@ -1,298 +1,290 @@ - /* - Flexibly-sized, indexable skiplist data structure for maintaining a sorted - list of values +Copyright (c) 2016, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. 
- Port of Wes McKinney's Cython version of Raymond Hettinger's original pure - Python recipe (http://rhettinger.wordpress.com/2010/02/06/lost-knowledge/) - */ +Flexibly-sized, index-able skiplist data structure for maintaining a sorted +list of values -// #include -// #include +Port of Wes McKinney's Cython version of Raymond Hettinger's original pure +Python recipe (http://rhettinger.wordpress.com/2010/02/06/lost-knowledge/) +*/ +#ifndef PANDAS_SRC_SKIPLIST_H_ +#define PANDAS_SRC_SKIPLIST_H_ +#include #include #include #include -#include #ifndef PANDAS_INLINE - #if defined(__GNUC__) - #define PANDAS_INLINE static __inline__ - #elif defined(_MSC_VER) - #define PANDAS_INLINE static __inline - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define PANDAS_INLINE static inline - #else - #define PANDAS_INLINE - #endif +#if defined(__GNUC__) +#define PANDAS_INLINE static __inline__ +#elif defined(_MSC_VER) +#define PANDAS_INLINE static __inline +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define PANDAS_INLINE static inline +#else +#define PANDAS_INLINE +#endif #endif -PANDAS_INLINE float __skiplist_nanf(void) -{ - const union { int __i; float __f;} __bint = {0x7fc00000UL}; +PANDAS_INLINE float __skiplist_nanf(void) { + const union { + int __i; + float __f; + } __bint = {0x7fc00000UL}; return __bint.__f; } -#define PANDAS_NAN ((double) __skiplist_nanf()) +#define PANDAS_NAN ((double)__skiplist_nanf()) - -PANDAS_INLINE double Log2(double val) { - return log(val) / log(2.); -} +PANDAS_INLINE double Log2(double val) { return log(val) / log(2.); } typedef struct node_t node_t; struct node_t { - node_t **next; - int *width; - double value; - int is_nil; - int levels; - int ref_count; + node_t **next; + int *width; + double value; + int is_nil; + int levels; + int ref_count; }; typedef struct { - node_t *head; - node_t **tmp_chain; - int *tmp_steps; - int size; - int maxlevels; + node_t *head; + node_t **tmp_chain; + int *tmp_steps; + int size; + int maxlevels; } skiplist_t; PANDAS_INLINE double urand(void) { - return ((double) rand() + 1) / ((double) RAND_MAX + 2); + return ((double)rand() + 1) / ((double)RAND_MAX + 2); } -PANDAS_INLINE int int_min(int a, int b) { - return a < b ? a : b; -} +PANDAS_INLINE int int_min(int a, int b) { return a < b ? 
a : b; } PANDAS_INLINE node_t *node_init(double value, int levels) { - node_t *result; - result = (node_t*) malloc(sizeof(node_t)); - if (result) { - result->value = value; - result->levels = levels; - result->is_nil = 0; - result->ref_count = 0; - result->next = (node_t**) malloc(levels * sizeof(node_t*)); - result->width = (int*) malloc(levels * sizeof(int)); - if (!(result->next && result->width) && (levels != 0)) { - free(result->next); - free(result->width); - free(result); - return NULL; - } - } - return result; + node_t *result; + result = (node_t *)malloc(sizeof(node_t)); + if (result) { + result->value = value; + result->levels = levels; + result->is_nil = 0; + result->ref_count = 0; + result->next = (node_t **)malloc(levels * sizeof(node_t *)); + result->width = (int *)malloc(levels * sizeof(int)); + if (!(result->next && result->width) && (levels != 0)) { + free(result->next); + free(result->width); + free(result); + return NULL; + } + } + return result; } // do this ourselves -PANDAS_INLINE void node_incref(node_t *node) { - ++(node->ref_count); -} +PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); } -PANDAS_INLINE void node_decref(node_t *node) { - --(node->ref_count); -} +PANDAS_INLINE void node_decref(node_t *node) { --(node->ref_count); } static void node_destroy(node_t *node) { - int i; - if (node) { - if (node->ref_count <= 1) { - for (i = 0; i < node->levels; ++i) { - node_destroy(node->next[i]); - } - free(node->next); - free(node->width); - // printf("Reference count was 1, freeing\n"); - free(node); - } - else { - node_decref(node); + int i; + if (node) { + if (node->ref_count <= 1) { + for (i = 0; i < node->levels; ++i) { + node_destroy(node->next[i]); + } + free(node->next); + free(node->width); + // printf("Reference count was 1, freeing\n"); + free(node); + } else { + node_decref(node); + } + // pretty sure that freeing the struct above will be enough } - // pretty sure that freeing the struct above will be enough - } } PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) { - if (skp) { - node_destroy(skp->head); - free(skp->tmp_steps); - free(skp->tmp_chain); - free(skp); - } + if (skp) { + node_destroy(skp->head); + free(skp->tmp_steps); + free(skp->tmp_chain); + free(skp); + } } PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) { - skiplist_t *result; - node_t *NIL, *head; - int maxlevels, i; - - maxlevels = 1 + Log2((double) expected_size); - result = (skiplist_t*) malloc(sizeof(skiplist_t)); - if (!result) { - return NULL; - } - result->tmp_chain = (node_t**) malloc(maxlevels * sizeof(node_t*)); - result->tmp_steps = (int*) malloc(maxlevels * sizeof(int)); - result->maxlevels = maxlevels; - result->size = 0; - - head = result->head = node_init(PANDAS_NAN, maxlevels); - NIL = node_init(0.0, 0); - - if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) { - skiplist_destroy(result); - node_destroy(NIL); - return NULL; - } - - node_incref(head); - - NIL->is_nil = 1; - - for (i = 0; i < maxlevels; ++i) - { - head->next[i] = NIL; - head->width[i] = 1; - node_incref(NIL); - } - - return result; + skiplist_t *result; + node_t *NIL, *head; + int maxlevels, i; + + maxlevels = 1 + Log2((double)expected_size); + result = (skiplist_t *)malloc(sizeof(skiplist_t)); + if (!result) { + return NULL; + } + result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *)); + result->tmp_steps = (int *)malloc(maxlevels * sizeof(int)); + result->maxlevels = maxlevels; + result->size = 0; + + head = result->head = node_init(PANDAS_NAN, 
maxlevels); + NIL = node_init(0.0, 0); + + if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) { + skiplist_destroy(result); + node_destroy(NIL); + return NULL; + } + + node_incref(head); + + NIL->is_nil = 1; + + for (i = 0; i < maxlevels; ++i) { + head->next[i] = NIL; + head->width[i] = 1; + node_incref(NIL); + } + + return result; } // 1 if left < right, 0 if left == right, -1 if left > right -PANDAS_INLINE int _node_cmp(node_t* node, double value){ - if (node->is_nil || node->value > value) { - return -1; - } - else if (node->value < value) { - return 1; - } - else { - return 0; - } +PANDAS_INLINE int _node_cmp(node_t *node, double value) { + if (node->is_nil || node->value > value) { + return -1; + } else if (node->value < value) { + return 1; + } else { + return 0; + } } PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { - node_t *node; - int level; - - if (i < 0 || i >= skp->size) { - *ret = 0; - return 0; - } - - node = skp->head; - ++i; - for (level = skp->maxlevels - 1; level >= 0; --level) - { - while (node->width[level] <= i) - { - i -= node->width[level]; - node = node->next[level]; + node_t *node; + int level; + + if (i < 0 || i >= skp->size) { + *ret = 0; + return 0; + } + + node = skp->head; + ++i; + for (level = skp->maxlevels - 1; level >= 0; --level) { + while (node->width[level] <= i) { + i -= node->width[level]; + node = node->next[level]; + } } - } - *ret = 1; - return node->value; + *ret = 1; + return node->value; } PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { - node_t *node, *prevnode, *newnode, *next_at_level; - int *steps_at_level; - int size, steps, level; - node_t **chain; - - chain = skp->tmp_chain; - - steps_at_level = skp->tmp_steps; - memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); - - node = skp->head; - - for (level = skp->maxlevels - 1; level >= 0; --level) - { - next_at_level = node->next[level]; - while (_node_cmp(next_at_level, value) >= 0) { - steps_at_level[level] += node->width[level]; - node = next_at_level; - next_at_level = node->next[level]; + node_t *node, *prevnode, *newnode, *next_at_level; + int *steps_at_level; + int size, steps, level; + node_t **chain; + + chain = skp->tmp_chain; + + steps_at_level = skp->tmp_steps; + memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); + + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) >= 0) { + steps_at_level[level] += node->width[level]; + node = next_at_level; + next_at_level = node->next[level]; + } + chain[level] = node; } - chain[level] = node; - } - size = int_min(skp->maxlevels, 1 - ((int) Log2(urand()))); + size = int_min(skp->maxlevels, 1 - ((int)Log2(urand()))); - newnode = node_init(value, size); - if (!newnode) { - return -1; - } - steps = 0; + newnode = node_init(value, size); + if (!newnode) { + return -1; + } + steps = 0; - for (level = 0; level < size; ++level) { - prevnode = chain[level]; - newnode->next[level] = prevnode->next[level]; + for (level = 0; level < size; ++level) { + prevnode = chain[level]; + newnode->next[level] = prevnode->next[level]; - prevnode->next[level] = newnode; - node_incref(newnode); // increment the reference count + prevnode->next[level] = newnode; + node_incref(newnode); // increment the reference count - newnode->width[level] = prevnode->width[level] - steps; - prevnode->width[level] = steps + 1; + newnode->width[level] = prevnode->width[level] - steps; + prevnode->width[level] = 
steps + 1; - steps += steps_at_level[level]; - } + steps += steps_at_level[level]; + } - for (level = size; level < skp->maxlevels; ++level) { - chain[level]->width[level] += 1; - } + for (level = size; level < skp->maxlevels; ++level) { + chain[level]->width[level] += 1; + } - ++(skp->size); + ++(skp->size); - return 1; + return 1; } PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { - int level, size; - node_t *node, *prevnode, *tmpnode, *next_at_level; - node_t **chain; - - chain = skp->tmp_chain; - node = skp->head; - - for (level = skp->maxlevels - 1; level >= 0; --level) - { - next_at_level = node->next[level]; - while (_node_cmp(next_at_level, value) > 0) { - node = next_at_level; - next_at_level = node->next[level]; + int level, size; + node_t *node, *prevnode, *tmpnode, *next_at_level; + node_t **chain; + + chain = skp->tmp_chain; + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) > 0) { + node = next_at_level; + next_at_level = node->next[level]; + } + chain[level] = node; } - chain[level] = node; - } - if (value != chain[0]->next[0]->value) { - return 0; - } + if (value != chain[0]->next[0]->value) { + return 0; + } - size = chain[0]->next[0]->levels; + size = chain[0]->next[0]->levels; - for (level = 0; level < size; ++level) { - prevnode = chain[level]; + for (level = 0; level < size; ++level) { + prevnode = chain[level]; - tmpnode = prevnode->next[level]; + tmpnode = prevnode->next[level]; - prevnode->width[level] += tmpnode->width[level] - 1; - prevnode->next[level] = tmpnode->next[level]; + prevnode->width[level] += tmpnode->width[level] - 1; + prevnode->next[level] = tmpnode->next[level]; - tmpnode->next[level] = NULL; - node_destroy(tmpnode); // decrement refcount or free - } + tmpnode->next[level] = NULL; + node_destroy(tmpnode); // decrement refcount or free + } - for (level = size; level < skp->maxlevels; ++level) { - --(chain[level]->width[level]); - } + for (level = size; level < skp->maxlevels; ++level) { + --(chain[level]->width[level]); + } - --(skp->size); - return 1; + --(skp->size); + return 1; } + +#endif // PANDAS_SRC_SKIPLIST_H_ diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h index c37fe8c8e6c38..3bfb4b26c0095 100644 --- a/pandas/src/ujson/lib/ultrajson.h +++ b/pandas/src/ujson/lib/ultrajson.h @@ -49,8 +49,8 @@ tree doesn't have cyclic references. 
*/ -#ifndef __ULTRAJSON_H__ -#define __ULTRAJSON_H__ +#ifndef PANDAS_SRC_UJSON_LIB_ULTRAJSON_H_ +#define PANDAS_SRC_UJSON_LIB_ULTRAJSON_H_ #include #include @@ -143,25 +143,23 @@ typedef int64_t JSLONG; #error "Endianess not supported" #endif -enum JSTYPES -{ - JT_NULL, // NULL - JT_TRUE, //boolean true - JT_FALSE, //boolean false - JT_INT, //(JSINT32 (signed 32-bit)) - JT_LONG, //(JSINT64 (signed 64-bit)) - JT_DOUBLE, //(double) - JT_UTF8, //(char 8-bit) - JT_ARRAY, // Array structure - JT_OBJECT, // Key/Value structure - JT_INVALID, // Internal, do not return nor expect +enum JSTYPES { + JT_NULL, // NULL + JT_TRUE, // boolean true + JT_FALSE, // boolean false + JT_INT, // (JSINT32 (signed 32-bit)) + JT_LONG, // (JSINT64 (signed 64-bit)) + JT_DOUBLE, // (double) + JT_UTF8, // (char 8-bit) + JT_ARRAY, // Array structure + JT_OBJECT, // Key/Value structure + JT_INVALID, // Internal, do not return nor expect }; typedef void * JSOBJ; typedef void * JSITER; -typedef struct __JSONTypeContext -{ +typedef struct __JSONTypeContext { int type; void *encoder; void *prv; @@ -173,16 +171,17 @@ typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); -typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, size_t *outLen); +typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen); typedef void *(*JSPFN_MALLOC)(size_t size); typedef void (*JSPFN_FREE)(void *pptr); typedef void *(*JSPFN_REALLOC)(void *base, size_t size); -typedef struct __JSONObjectEncoder -{ +typedef struct __JSONObjectEncoder { void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc); - const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen); + const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen); JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); @@ -256,10 +255,8 @@ typedef struct __JSONObjectEncoder char *end; int heap; int level; - } JSONObjectEncoder; - /* Encode an object structure into JSON. @@ -279,12 +276,10 @@ Life cycle of the provided buffer must still be handled by caller. If the return value doesn't equal the specified buffer caller must release the memory using JSONObjectEncoder.free or free() as specified when calling this function. 
*/ -EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer); - +EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, + char *buffer, size_t cbBuffer); - -typedef struct __JSONObjectDecoder -{ +typedef struct __JSONObjectDecoder { JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end); int (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value); int (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value); @@ -308,7 +303,8 @@ typedef struct __JSONObjectDecoder void *prv; } JSONObjectDecoder; -EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); +EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, + const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); -#endif +#endif // PANDAS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c index 5496068832f2e..a847b0f5d5102 100644 --- a/pandas/src/ujson/lib/ultrajsondec.c +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -16,8 +16,10 @@ derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT @@ -27,7 +29,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -35,15 +38,15 @@ Numeric decoder derived from from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. 
*/ -#include "ultrajson.h" -#include #include -#include -#include -#include -#include #include +#include #include +#include +#include +#include +#include +#include "ultrajson.h" #ifndef TRUE #define TRUE 1 @@ -53,871 +56,1096 @@ Numeric decoder derived from from TCL library #define NULL 0 #endif -struct DecoderState -{ - char *start; - char *end; - wchar_t *escStart; - wchar_t *escEnd; - int escHeap; - int lastType; - JSUINT32 objDepth; - void *prv; - JSONObjectDecoder *dec; +struct DecoderState { + char *start; + char *end; + wchar_t *escStart; + wchar_t *escEnd; + int escHeap; + int lastType; + JSUINT32 objDepth; + void *prv; + JSONObjectDecoder *dec; }; -JSOBJ FASTCALL_MSVC decode_any( struct DecoderState *ds) FASTCALL_ATTR; -typedef JSOBJ (*PFN_DECODER)( struct DecoderState *ds); +JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) FASTCALL_ATTR; +typedef JSOBJ (*PFN_DECODER)(struct DecoderState *ds); -static JSOBJ SetError( struct DecoderState *ds, int offset, const char *message) -{ - ds->dec->errorOffset = ds->start + offset; - ds->dec->errorStr = (char *) message; - return NULL; +static JSOBJ SetError(struct DecoderState *ds, int offset, + const char *message) { + ds->dec->errorOffset = ds->start + offset; + ds->dec->errorStr = (char *)message; + return NULL; } -double createDouble(double intNeg, double intValue, double frcValue, int frcDecimalCount) -{ - static const double g_pow10[] = {1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001,0.0000001, 0.00000001, 0.000000001, 0.0000000001, 0.00000000001, 0.000000000001, 0.0000000000001, 0.00000000000001, 0.000000000000001}; - return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; +double createDouble(double intNeg, double intValue, double frcValue, + int frcDecimalCount) { + static const double g_pow10[] = {1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + 0.00000000001, + 0.000000000001, + 0.0000000000001, + 0.00000000000001, + 0.000000000000001}; + return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) -{ - char *end; - double value; - errno = 0; +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { + char *end; + double value; + errno = 0; - value = strtod(ds->start, &end); + value = strtod(ds->start, &end); - if (errno == ERANGE) - { - return SetError(ds, -1, "Range error when decoding numeric as double"); - } + if (errno == ERANGE) { + return SetError(ds, -1, "Range error when decoding numeric as double"); + } - ds->start = end; - return ds->dec->newDouble(ds->prv, value); + ds->start = end; + return ds->dec->newDouble(ds->prv, value); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric (struct DecoderState *ds) -{ - int intNeg = 1; - int mantSize = 0; - JSUINT64 intValue; - int chr; - int decimalCount = 0; - double frcValue = 0.0; - double expNeg; - double expValue; - char *offset = ds->start; - - JSUINT64 overflowLimit = LLONG_MAX; - - if (*(offset) == '-') - { - offset ++; - intNeg = -1; - overflowLimit = LLONG_MIN; - } - - // Scan integer part - intValue = 0; - - while (1) - { - chr = (int) (unsigned char) *(offset); - - switch (chr) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - //FIXME: Check for arithemtic overflow here - //PERF: Don't do 64-bit arithmetic here unless we know we have to - intValue = intValue * 10ULL + (JSLONG) (chr - 
48); - - if (intValue > overflowLimit) - { - return SetError(ds, -1, overflowLimit == LLONG_MAX ? "Value is too big" : "Value is too small"); - } +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { + int intNeg = 1; + int mantSize = 0; + JSUINT64 intValue; + int chr; + int decimalCount = 0; + double frcValue = 0.0; + double expNeg; + double expValue; + char *offset = ds->start; + + JSUINT64 overflowLimit = LLONG_MAX; + + if (*(offset) == '-') { + offset++; + intNeg = -1; + overflowLimit = LLONG_MIN; + } - offset ++; - mantSize ++; - break; - } - case '.': - { - offset ++; - goto DECODE_FRACTION; - break; - } - case 'e': - case 'E': - { - offset ++; - goto DECODE_EXPONENT; - break; - } - - default: - { - goto BREAK_INT_LOOP; - break; - } + // Scan integer part + intValue = 0; + + while (1) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // FIXME: Check for arithemtic overflow here + // PERF: Don't do 64-bit arithmetic here unless we know we have + // to + intValue = intValue * 10ULL + (JSLONG)(chr - 48); + + if (intValue > overflowLimit) { + return SetError(ds, -1, overflowLimit == LLONG_MAX + ? "Value is too big" + : "Value is too small"); + } + + offset++; + mantSize++; + break; + } + case '.': { + offset++; + goto DECODE_FRACTION; + break; + } + case 'e': + case 'E': { + offset++; + goto DECODE_EXPONENT; + break; + } + + default: { + goto BREAK_INT_LOOP; + break; + } + } } - } BREAK_INT_LOOP: - ds->lastType = JT_INT; - ds->start = offset; + ds->lastType = JT_INT; + ds->start = offset; - if ((intValue >> 31)) - { - return ds->dec->newLong(ds->prv, (JSINT64) (intValue * (JSINT64) intNeg)); - } - else - { - return ds->dec->newInt(ds->prv, (JSINT32) (intValue * intNeg)); - } + if ((intValue >> 31)) { + return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); + } else { + return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); + } DECODE_FRACTION: - if (ds->dec->preciseFloat) - { - return decodePreciseFloat(ds); - } - - // Scan fraction part - frcValue = 0.0; - for (;;) - { - chr = (int) (unsigned char) *(offset); - - switch (chr) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) - { - frcValue = frcValue * 10.0 + (double) (chr - 48); - decimalCount ++; + if (ds->dec->preciseFloat) { + return decodePreciseFloat(ds); + } + + // Scan fraction part + frcValue = 0.0; + for (;;) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) { + frcValue = frcValue * 10.0 + (double)(chr - 48); + decimalCount++; + } + offset++; + break; + } + case 'e': + case 'E': { + offset++; + goto DECODE_EXPONENT; + break; + } + default: { goto BREAK_FRC_LOOP; } } - offset ++; - break; - } - case 'e': - case 'E': - { - offset ++; - goto DECODE_EXPONENT; - break; - } - default: - { - goto BREAK_FRC_LOOP; - } } - } BREAK_FRC_LOOP: - //FIXME: Check for arithemtic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble (ds->prv, createDouble( (double) intNeg, (double) intValue, frcValue, decimalCount)); + // FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + return 
ds->dec->newDouble( + ds->prv, + createDouble((double)intNeg, (double)intValue, frcValue, decimalCount)); DECODE_EXPONENT: - if (ds->dec->preciseFloat) - { - return decodePreciseFloat(ds); - } - - expNeg = 1.0; - - if (*(offset) == '-') - { - expNeg = -1.0; - offset ++; - } - else - if (*(offset) == '+') - { - expNeg = +1.0; - offset ++; - } - - expValue = 0.0; - - for (;;) - { - chr = (int) (unsigned char) *(offset); - - switch (chr) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - expValue = expValue * 10.0 + (double) (chr - 48); - offset ++; - break; - } - default: - { - goto BREAK_EXP_LOOP; - } + if (ds->dec->preciseFloat) { + return decodePreciseFloat(ds); + } + + expNeg = 1.0; + + if (*(offset) == '-') { + expNeg = -1.0; + offset++; + } else if (*(offset) == '+') { + expNeg = +1.0; + offset++; + } + + expValue = 0.0; + + for (;;) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + expValue = expValue * 10.0 + (double)(chr - 48); + offset++; + break; + } + default: { goto BREAK_EXP_LOOP; } + } } - } BREAK_EXP_LOOP: - //FIXME: Check for arithemtic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble (ds->prv, createDouble( (double) intNeg, (double) intValue , frcValue, decimalCount) * pow(10.0, expValue * expNeg)); + // FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + return ds->dec->newDouble( + ds->prv, + createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) * + pow(10.0, expValue * expNeg)); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true ( struct DecoderState *ds) -{ - char *offset = ds->start; - offset ++; +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) { + char *offset = ds->start; + offset++; - if (*(offset++) != 'r') - goto SETERROR; - if (*(offset++) != 'u') - goto SETERROR; - if (*(offset++) != 'e') - goto SETERROR; + if (*(offset++) != 'r') goto SETERROR; + if (*(offset++) != 'u') goto SETERROR; + if (*(offset++) != 'e') goto SETERROR; - ds->lastType = JT_TRUE; - ds->start = offset; - return ds->dec->newTrue(ds->prv); + ds->lastType = JT_TRUE; + ds->start = offset; + return ds->dec->newTrue(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'true'"); + return SetError(ds, -1, "Unexpected character found when decoding 'true'"); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false ( struct DecoderState *ds) -{ - char *offset = ds->start; - offset ++; +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) { + char *offset = ds->start; + offset++; - if (*(offset++) != 'a') - goto SETERROR; - if (*(offset++) != 'l') - goto SETERROR; - if (*(offset++) != 's') - goto SETERROR; - if (*(offset++) != 'e') - goto SETERROR; + if (*(offset++) != 'a') goto SETERROR; + if (*(offset++) != 'l') goto SETERROR; + if (*(offset++) != 's') goto SETERROR; + if (*(offset++) != 'e') goto SETERROR; - ds->lastType = JT_FALSE; - ds->start = offset; - return ds->dec->newFalse(ds->prv); + ds->lastType = JT_FALSE; + ds->start = offset; + return ds->dec->newFalse(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'false'"); + return SetError(ds, -1, "Unexpected character found when decoding 'false'"); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null ( struct DecoderState *ds) -{ - char 
*offset = ds->start; - offset ++; +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) { + char *offset = ds->start; + offset++; - if (*(offset++) != 'u') - goto SETERROR; - if (*(offset++) != 'l') - goto SETERROR; - if (*(offset++) != 'l') - goto SETERROR; + if (*(offset++) != 'u') goto SETERROR; + if (*(offset++) != 'l') goto SETERROR; + if (*(offset++) != 'l') goto SETERROR; - ds->lastType = JT_NULL; - ds->start = offset; - return ds->dec->newNull(ds->prv); + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'null'"); + return SetError(ds, -1, "Unexpected character found when decoding 'null'"); } -FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) -{ - char *offset; - - for (offset = ds->start; (ds->end - offset) > 0; offset ++) - { - switch (*offset) - { - case ' ': - case '\t': - case '\r': - case '\n': - break; - - default: - ds->start = offset; - return; +FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) { + char *offset; + + for (offset = ds->start; (ds->end - offset) > 0; offset++) { + switch (*offset) { + case ' ': + case '\t': + case '\r': + case '\n': + break; + + default: + ds->start = offset; + return; + } } - } - if (offset == ds->end) - { - ds->start = ds->end; - } + if (offset == ds->end) { + ds->start = ds->end; + } } -enum DECODESTRINGSTATE -{ - DS_ISNULL = 0x32, - DS_ISQUOTE, - DS_ISESCAPE, - DS_UTFLENERROR, - +enum DECODESTRINGSTATE { + DS_ISNULL = 0x32, + DS_ISQUOTE, + DS_ISESCAPE, + DS_UTFLENERROR, }; -static const JSUINT8 g_decoderLookup[256] = -{ - /* 0x00 */ DS_ISNULL, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x20 */ 1, 1, DS_ISQUOTE, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, DS_ISESCAPE, 1, 1, 1, - /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, +static const JSUINT8 g_decoderLookup[256] = { + /* 0x00 */ DS_ISNULL, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x10 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x20 */ 1, + 1, + DS_ISQUOTE, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x30 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x40 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x50 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + DS_ISESCAPE, + 1, + 1, + 1, + /* 0x60 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x70 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, 
+ 1, + 1, + 1, + 1, + 1, + /* 0x80 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x90 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0xa0 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0xb0 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0xc0 */ 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + /* 0xd0 */ 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + /* 0xe0 */ 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + /* 0xf0 */ 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + DS_UTFLENERROR, + DS_UTFLENERROR, + DS_UTFLENERROR, + DS_UTFLENERROR, + DS_UTFLENERROR, + DS_UTFLENERROR, + DS_UTFLENERROR, + DS_UTFLENERROR, }; -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) -{ - JSUTF16 sur[2] = { 0 }; - int iSur = 0; - int index; - wchar_t *escOffset; - wchar_t *escStart; - size_t escLen = (ds->escEnd - ds->escStart); - JSUINT8 *inputOffset; - JSUINT8 oct; - JSUTF32 ucs; - ds->lastType = JT_INVALID; - ds->start ++; - - if ( (size_t) (ds->end - ds->start) > escLen) - { - size_t newSize = (ds->end - ds->start); - - if (ds->escHeap) - { - if (newSize > (SIZE_MAX / sizeof(wchar_t))) - { - return SetError(ds, -1, "Could not reserve memory block"); - } - escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t)); - if (!escStart) - { - ds->dec->free(ds->escStart); - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = escStart; - } - else - { - wchar_t *oldStart = ds->escStart; - if (newSize > (SIZE_MAX / sizeof(wchar_t))) - { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t)); - if (!ds->escStart) - { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escHeap = 1; - memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); - } +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) { + JSUTF16 sur[2] = {0}; + int iSur = 0; + int index; + wchar_t *escOffset; + wchar_t *escStart; + size_t escLen = (ds->escEnd - ds->escStart); + JSUINT8 *inputOffset; + JSUINT8 oct; + JSUTF32 ucs; + ds->lastType = JT_INVALID; + ds->start++; - ds->escEnd = ds->escStart + newSize; - } - - escOffset = ds->escStart; - inputOffset = (JSUINT8 *) ds->start; - - for (;;) - { - switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) - { - case DS_ISNULL: - { - return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); - } - case DS_ISQUOTE: - { - ds->lastType = JT_UTF8; - inputOffset ++; - ds->start += ( (char *) inputOffset - (ds->start)); - return ds->dec->newString(ds->prv, ds->escStart, escOffset); - } - case DS_UTFLENERROR: - { - return SetError (ds, -1, "Invalid UTF-8 sequence length when decoding 'string'"); - } - case DS_ISESCAPE: - inputOffset ++; - switch (*inputOffset) - { - case '\\': *(escOffset++) = L'\\'; inputOffset++; continue; - case '\"': *(escOffset++) = L'\"'; inputOffset++; continue; - case '/': *(escOffset++) = L'/'; inputOffset++; continue; - case 'b': *(escOffset++) = L'\b'; inputOffset++; continue; - case 'f': *(escOffset++) = L'\f'; inputOffset++; continue; - case 'n': *(escOffset++) = L'\n'; inputOffset++; continue; - case 'r': *(escOffset++) = L'\r'; inputOffset++; continue; - case 't': *(escOffset++) = L'\t'; inputOffset++; continue; - - case 'u': - { - int index; - 
inputOffset ++; - - for (index = 0; index < 4; index ++) - { - switch (*inputOffset) - { - case '\0': return SetError (ds, -1, "Unterminated unicode escape sequence when decoding 'string'"); - default: return SetError (ds, -1, "Unexpected character in unicode escape sequence when decoding 'string'"); - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0'); - break; - - case 'a': - case 'b': - case 'c': - case 'd': - case 'e': - case 'f': - sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); - break; - - case 'A': - case 'B': - case 'C': - case 'D': - case 'E': - case 'F': - sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); - break; - } - - inputOffset ++; + if ((size_t)(ds->end - ds->start) > escLen) { + size_t newSize = (ds->end - ds->start); + + if (ds->escHeap) { + if (newSize > (SIZE_MAX / sizeof(wchar_t))) { + return SetError(ds, -1, "Could not reserve memory block"); + } + escStart = (wchar_t *)ds->dec->realloc(ds->escStart, + newSize * sizeof(wchar_t)); + if (!escStart) { + ds->dec->free(ds->escStart); + return SetError(ds, -1, "Could not reserve memory block"); } + ds->escStart = escStart; + } else { + wchar_t *oldStart = ds->escStart; + if (newSize > (SIZE_MAX / sizeof(wchar_t))) { + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escStart = + (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t)); + if (!ds->escStart) { + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escHeap = 1; + memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); + } - if (iSur == 0) - { - if((sur[iSur] & 0xfc00) == 0xd800) - { - // First of a surrogate pair, continue parsing - iSur ++; - break; - } - (*escOffset++) = (wchar_t) sur[iSur]; - iSur = 0; + ds->escEnd = ds->escStart + newSize; + } + + escOffset = ds->escStart; + inputOffset = (JSUINT8 *)ds->start; + + for (;;) { + switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) { + case DS_ISNULL: { + return SetError(ds, -1, + "Unmatched ''\"' when when decoding 'string'"); + } + case DS_ISQUOTE: { + ds->lastType = JT_UTF8; + inputOffset++; + ds->start += ((char *)inputOffset - (ds->start)); + return ds->dec->newString(ds->prv, ds->escStart, escOffset); } - else - { - // Decode pair - if ((sur[1] & 0xfc00) != 0xdc00) - { - return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'"); - } + case DS_UTFLENERROR: { + return SetError( + ds, -1, + "Invalid UTF-8 sequence length when decoding 'string'"); + } + case DS_ISESCAPE: + inputOffset++; + switch (*inputOffset) { + case '\\': + *(escOffset++) = L'\\'; + inputOffset++; + continue; + case '\"': + *(escOffset++) = L'\"'; + inputOffset++; + continue; + case '/': + *(escOffset++) = L'/'; + inputOffset++; + continue; + case 'b': + *(escOffset++) = L'\b'; + inputOffset++; + continue; + case 'f': + *(escOffset++) = L'\f'; + inputOffset++; + continue; + case 'n': + *(escOffset++) = L'\n'; + inputOffset++; + continue; + case 'r': + *(escOffset++) = L'\r'; + inputOffset++; + continue; + case 't': + *(escOffset++) = L'\t'; + inputOffset++; + continue; + + case 'u': { + int index; + inputOffset++; + + for (index = 0; index < 4; index++) { + switch (*inputOffset) { + case '\0': + return SetError(ds, -1, + "Unterminated unicode " + "escape sequence when " + "decoding 'string'"); + default: + return SetError(ds, -1, + "Unexpected character in " + "unicode escape sequence " + "when decoding 'string'"); + + 
case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sur[iSur] = (sur[iSur] << 4) + + (JSUTF16)(*inputOffset - '0'); + break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + sur[iSur] = (sur[iSur] << 4) + 10 + + (JSUTF16)(*inputOffset - 'a'); + break; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + sur[iSur] = (sur[iSur] << 4) + 10 + + (JSUTF16)(*inputOffset - 'A'); + break; + } + + inputOffset++; + } + + if (iSur == 0) { + if ((sur[iSur] & 0xfc00) == 0xd800) { + // First of a surrogate pair, continue parsing + iSur++; + break; + } + (*escOffset++) = (wchar_t)sur[iSur]; + iSur = 0; + } else { + // Decode pair + if ((sur[1] & 0xfc00) != 0xdc00) { + return SetError(ds, -1, + "Unpaired high surrogate when " + "decoding 'string'"); + } #if WCHAR_MAX == 0xffff - (*escOffset++) = (wchar_t) sur[0]; - (*escOffset++) = (wchar_t) sur[1]; + (*escOffset++) = (wchar_t)sur[0]; + (*escOffset++) = (wchar_t)sur[1]; #else - (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); + (*escOffset++) = + (wchar_t)0x10000 + + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); #endif - iSur = 0; + iSur = 0; + } + break; + } + + case '\0': + return SetError(ds, -1, + "Unterminated escape sequence when " + "decoding 'string'"); + default: + return SetError(ds, -1, + "Unrecognized escape sequence when " + "decoding 'string'"); + } + break; + + case 1: { + *(escOffset++) = (wchar_t)(*inputOffset++); + break; } - break; - } - case '\0': return SetError(ds, -1, "Unterminated escape sequence when decoding 'string'"); - default: return SetError(ds, -1, "Unrecognized escape sequence when decoding 'string'"); - } - break; - - case 1: - { - *(escOffset++) = (wchar_t) (*inputOffset++); - break; - } - - case 2: - { - ucs = (*inputOffset++) & 0x1f; - ucs <<= 6; - if (((*inputOffset) & 0x80) != 0x80) - { - return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); - } - ucs |= (*inputOffset++) & 0x3f; - if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'"); - *(escOffset++) = (wchar_t) ucs; - break; - } - - case 3: - { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x0f; - - for (index = 0; index < 2; index ++) - { - ucs <<= 6; - oct = (*inputOffset++); - - if ((oct & 0x80) != 0x80) - { - return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); - } - - ucs |= oct & 0x3f; - } + case 2: { + ucs = (*inputOffset++) & 0x1f; + ucs <<= 6; + if (((*inputOffset) & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); + } + ucs |= (*inputOffset++) & 0x3f; + if (ucs < 0x80) + return SetError(ds, -1, + "Overlong 2 byte UTF-8 sequence detected " + "when decoding 'string'"); + *(escOffset++) = (wchar_t)ucs; + break; + } - if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string"); - *(escOffset++) = (wchar_t) ucs; - break; - } + case 3: { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x0f; - case 4: - { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x07; + for (index = 0; index < 2; index++) { + ucs <<= 6; + oct = (*inputOffset++); - for (index = 0; index < 3; index ++) - { - ucs <<= 6; - oct = (*inputOffset++); + if ((oct & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); + } - if ((oct & 0x80) != 0x80) - { - return SetError(ds, 
-1, "Invalid octet in UTF-8 sequence when decoding 'string'"); - } + ucs |= oct & 0x3f; + } - ucs |= oct & 0x3f; - } + if (ucs < 0x800) + return SetError(ds, -1, + "Overlong 3 byte UTF-8 sequence detected " + "when encoding string"); + *(escOffset++) = (wchar_t)ucs; + break; + } - if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'"); + case 4: { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x07; + + for (index = 0; index < 3; index++) { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x10000) + return SetError(ds, -1, + "Overlong 4 byte UTF-8 sequence detected " + "when decoding 'string'"); #if WCHAR_MAX == 0xffff - if (ucs >= 0x10000) - { - ucs -= 0x10000; - *(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800; - *(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00; - } - else - { - *(escOffset++) = (wchar_t) ucs; - } + if (ucs >= 0x10000) { + ucs -= 0x10000; + *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800; + *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00; + } else { + *(escOffset++) = (wchar_t)ucs; + } #else - *(escOffset++) = (wchar_t) ucs; + *(escOffset++) = (wchar_t)ucs; #endif - break; - } + break; + } + } } - } } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) -{ - JSOBJ itemValue; - JSOBJ newObj; - int len; - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); - } - - newObj = ds->dec->newArray(ds->prv, ds->dec); - len = 0; - - ds->lastType = JT_INVALID; - ds->start ++; - - for (;;) - { - SkipWhitespace(ds); - - if ((*ds->start) == ']') - { - ds->objDepth--; - if (len == 0) - { - ds->start ++; - return ds->dec->endArray(ds->prv, newObj); - } - - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError(ds, -1, "Unexpected character found when decoding array value (1)"); +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { + JSOBJ itemValue; + JSOBJ newObj; + int len; + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); } - itemValue = decode_any(ds); + newObj = ds->dec->newArray(ds->prv, ds->dec); + len = 0; - if (itemValue == NULL) - { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + ds->lastType = JT_INVALID; + ds->start++; - if (!ds->dec->arrayAddItem (ds->prv, newObj, itemValue)) - { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + for (;;) { + SkipWhitespace(ds); + + if ((*ds->start) == ']') { + ds->objDepth--; + if (len == 0) { + ds->start++; + return ds->dec->endArray(ds->prv, newObj); + } + + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError( + ds, -1, + "Unexpected character found when decoding array value (1)"); + } - SkipWhitespace(ds); + itemValue = decode_any(ds); + + if (itemValue == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } + + if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) { + case ']': { + ds->objDepth--; + return ds->dec->endArray(ds->prv, newObj); + } + case ',': + break; - switch (*(ds->start++)) - { - case ']': - { - ds->objDepth--; - return ds->dec->endArray(ds->prv, newObj); + default: + 
ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError( + ds, -1, + "Unexpected character found when decoding array value (2)"); + } + + len++; } - case ',': - break; +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { + JSOBJ itemName; + JSOBJ itemValue; + JSOBJ newObj; - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError(ds, -1, "Unexpected character found when decoding array value (2)"); + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); } - len ++; - } -} + newObj = ds->dec->newObject(ds->prv, ds->dec); -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object( struct DecoderState *ds) -{ - JSOBJ itemName; - JSOBJ itemValue; - JSOBJ newObj; + ds->start++; - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); - } + for (;;) { + SkipWhitespace(ds); - newObj = ds->dec->newObject(ds->prv, ds->dec); + if ((*ds->start) == '}') { + ds->objDepth--; + ds->start++; + return ds->dec->endObject(ds->prv, newObj); + } - ds->start ++; + ds->lastType = JT_INVALID; + itemName = decode_any(ds); - for (;;) - { - SkipWhitespace(ds); + if (itemName == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } - if ((*ds->start) == '}') - { - ds->objDepth--; - ds->start ++; - return ds->dec->endObject(ds->prv, newObj); - } + if (ds->lastType != JT_UTF8) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError( + ds, -1, + "Key name of object must be 'string' when decoding 'object'"); + } - ds->lastType = JT_INVALID; - itemName = decode_any(ds); + SkipWhitespace(ds); - if (itemName == NULL) - { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + if (*(ds->start++) != ':') { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError(ds, -1, "No ':' found when decoding object value"); + } - if (ds->lastType != JT_UTF8) - { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError(ds, -1, "Key name of object must be 'string' when decoding 'object'"); - } + SkipWhitespace(ds); - SkipWhitespace(ds); + itemValue = decode_any(ds); - if (*(ds->start++) != ':') - { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError(ds, -1, "No ':' found when decoding object value"); - } + if (itemValue == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return NULL; + } - SkipWhitespace(ds); + if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + ds->dec->releaseObject(ds->prv, itemValue, ds->dec); + return NULL; + } - itemValue = decode_any(ds); + SkipWhitespace(ds); - if (itemValue == NULL) - { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return NULL; - } + switch (*(ds->start++)) { + case '}': { + ds->objDepth--; + return ds->dec->endObject(ds->prv, newObj); + } + case ',': + break; - if (!ds->dec->objectAddKey (ds->prv, newObj, itemName, itemValue)) - { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - 
ds->dec->releaseObject(ds->prv, itemValue, ds->dec); - return NULL; - } - - SkipWhitespace(ds); - - switch (*(ds->start++)) - { - case '}': - { - ds->objDepth--; - return ds->dec->endObject(ds->prv, newObj); - } - case ',': - break; - - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError(ds, -1, "Unexpected character found when decoding object value"); + default: + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError( + ds, -1, + "Unexpected character found when decoding object value"); + } } - } } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) -{ - for (;;) - { - switch (*ds->start) - { - case '\"': - return decode_string (ds); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '-': - return decode_numeric (ds); - - case '[': return decode_array (ds); - case '{': return decode_object (ds); - case 't': return decode_true (ds); - case 'f': return decode_false (ds); - case 'n': return decode_null (ds); - - case ' ': - case '\t': - case '\r': - case '\n': - // White space - ds->start ++; - break; - - default: - return SetError(ds, -1, "Expected object or value"); +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { + for (;;) { + switch (*ds->start) { + case '\"': + return decode_string(ds); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return decode_numeric(ds); + + case '[': + return decode_array(ds); + case '{': + return decode_object(ds); + case 't': + return decode_true(ds); + case 'f': + return decode_false(ds); + case 'n': + return decode_null(ds); + + case ' ': + case '\t': + case '\r': + case '\n': + // White space + ds->start++; + break; + + default: + return SetError(ds, -1, "Expected object or value"); + } } - } } -JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer) -{ - /* - FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */ - char *locale; - struct DecoderState ds; - wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; - JSOBJ ret; - - ds.start = (char *) buffer; - ds.end = ds.start + cbBuffer; - - ds.escStart = escBuffer; - ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); - ds.escHeap = 0; - ds.prv = dec->prv; - ds.dec = dec; - ds.dec->errorStr = NULL; - ds.dec->errorOffset = NULL; - ds.objDepth = 0; - - ds.dec = dec; - - locale = setlocale(LC_NUMERIC, NULL); - if (strcmp(locale, "C")) - { - locale = strdup(locale); - if (!locale) - { - return SetError(&ds, -1, "Could not reserve memory block"); +JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, + size_t cbBuffer) { + /* + FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode + escaping doesn't run into the wall each time */ + char *locale; + struct DecoderState ds; + wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSOBJ ret; + + ds.start = (char *)buffer; + ds.end = ds.start + cbBuffer; + + ds.escStart = escBuffer; + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escHeap = 0; + ds.prv = dec->prv; + ds.dec = dec; + ds.dec->errorStr = NULL; + ds.dec->errorOffset = NULL; + ds.objDepth = 0; + + ds.dec = dec; + + locale = setlocale(LC_NUMERIC, NULL); + if (strcmp(locale, "C")) { + locale = strdup(locale); + if (!locale) { + return SetError(&ds, -1, "Could 
not reserve memory block"); + } + setlocale(LC_NUMERIC, "C"); + ret = decode_any(&ds); + setlocale(LC_NUMERIC, locale); + free(locale); + } else { + ret = decode_any(&ds); + } + + if (ds.escHeap) { + dec->free(ds.escStart); } - setlocale(LC_NUMERIC, "C"); - ret = decode_any (&ds); - setlocale(LC_NUMERIC, locale); - free(locale); - } - else - { - ret = decode_any (&ds); - } - - if (ds.escHeap) - { - dec->free(ds.escStart); - } - - SkipWhitespace(&ds); - - if (ds.start != ds.end && ret) - { - dec->releaseObject(ds.prv, ret, ds.dec); - return SetError(&ds, -1, "Trailing data"); - } - - return ret; + + SkipWhitespace(&ds); + + if (ds.start != ds.end && ret) { + dec->releaseObject(ds.prv, ret, ds.dec); + return SetError(&ds, -1, "Trailing data"); + } + + return ret; } diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c index 2adf3cb707bdb..5a15071938c1a 100644 --- a/pandas/src/ujson/lib/ultrajsonenc.c +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -16,8 +16,10 @@ modification, are permitted provided that the following conditions are met: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT @@ -27,7 +29,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -35,15 +38,14 @@ Numeric decoder derived from from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. 
*/ -#include "ultrajson.h" -#include #include -#include -#include -#include -#include - #include +#include +#include +#include +#include +#include +#include "ultrajson.h" #ifndef TRUE #define TRUE 1 @@ -67,587 +69,821 @@ or UTF-16 surrogate pairs The extra 2 bytes are for the quotes around the string */ -#define RESERVE_STRING(_len) (2 + ((_len) * 6)) - -static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; +#define RESERVE_STRING(_len) (2 + ((_len)*6)) + +static const double g_pow10[] = {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000}; static const char g_hexChars[] = "0123456789abcdef"; static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/"; /* -FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands. +FIXME: While this is fine dandy and working it's a magic value mess which +probably only the author understands. Needs a cleanup and more documentation */ /* Table for pure ascii output escaping all characters above 127 to \uXXXX */ -static const JSUINT8 g_asciiOutputTable[256] = -{ -/* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, -/* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, -/* 0x20 */ 1, 1, 20, 1, 1, 1, 29, 1, 1, 1, 1, 1, 1, 1, 1, 24, -/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 29, 1, 29, 1, -/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1, -/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 -}; - -static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message) -{ - enc->errorMsg = message; - enc->errorObj = obj; +static const JSUINT8 g_asciiOutputTable[256] = { + /* 0x00 */ 0, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 10, + 12, + 14, + 30, + 16, + 18, + 30, + 30, + /* 0x10 */ 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + 30, + /* 0x20 */ 1, + 1, + 20, + 1, + 1, + 1, + 29, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 24, + /* 0x30 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 29, + 1, + 29, + 1, + /* 0x40 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x50 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 22, + 1, + 1, + 1, + /* 0x60 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x70 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x80 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0x90 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0xa0 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 
1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0xb0 */ 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + /* 0xc0 */ 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + /* 0xd0 */ 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + /* 0xe0 */ 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + /* 0xf0 */ 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 5, + 5, + 5, + 5, + 6, + 6, + 1, + 1}; + +static void SetError(JSOBJ obj, JSONObjectEncoder *enc, const char *message) { + enc->errorMsg = message; + enc->errorObj = obj; } /* -FIXME: Keep track of how big these get across several encoder calls and try to make an estimate +FIXME: Keep track of how big these get across several encoder calls and try to +make an estimate That way we won't run our head into the wall each call */ -void Buffer_Realloc (JSONObjectEncoder *enc, size_t cbNeeded) -{ - size_t curSize = enc->end - enc->start; - size_t newSize = curSize * 2; - size_t offset = enc->offset - enc->start; - - while (newSize < curSize + cbNeeded) - { - newSize *= 2; - } - - if (enc->heap) - { - enc->start = (char *) enc->realloc (enc->start, newSize); - if (!enc->start) - { - SetError (NULL, enc, "Could not reserve memory block"); - return; +void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) { + size_t curSize = enc->end - enc->start; + size_t newSize = curSize * 2; + size_t offset = enc->offset - enc->start; + + while (newSize < curSize + cbNeeded) { + newSize *= 2; } - } - else - { - char *oldStart = enc->start; - enc->heap = 1; - enc->start = (char *) enc->malloc (newSize); - if (!enc->start) - { - SetError (NULL, enc, "Could not reserve memory block"); - return; + + if (enc->heap) { + enc->start = (char *)enc->realloc(enc->start, newSize); + if (!enc->start) { + SetError(NULL, enc, "Could not reserve memory block"); + return; + } + } else { + char *oldStart = enc->start; + enc->heap = 1; + enc->start = (char *)enc->malloc(newSize); + if (!enc->start) { + SetError(NULL, enc, "Could not reserve memory block"); + return; + } + memcpy(enc->start, oldStart, offset); } - memcpy (enc->start, oldStart, offset); - } - enc->offset = enc->start + offset; - enc->end = enc->start + newSize; + enc->offset = enc->start + offset; + enc->end = enc->start + newSize; } -FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (char *outputOffset, unsigned short value) -{ - *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; - *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; - *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; - *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC +Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) { + *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; + *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; + *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; + *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; } -int Buffer_EscapeStringUnvalidated (JSONObjectEncoder *enc, const char *io, const char *end) -{ - char *of = (char *) enc->offset; - - for (;;) - { - switch (*io) - { - case 0x00: - { - if (io < end) - { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - break; - } - else - { - enc->offset += (of - enc->offset); - return TRUE; - } - } - case '\"': (*of++) = '\\'; (*of++) = '\"'; break; - case '\\': (*of++) = '\\'; (*of++) = '\\'; 
break; - case '/': (*of++) = '\\'; (*of++) = '/'; break; - case '\b': (*of++) = '\\'; (*of++) = 'b'; break; - case '\f': (*of++) = '\\'; (*of++) = 'f'; break; - case '\n': (*of++) = '\\'; (*of++) = 'n'; break; - case '\r': (*of++) = '\\'; (*of++) = 'r'; break; - case '\t': (*of++) = '\\'; (*of++) = 't'; break; - - case 0x26: // '/' - case 0x3c: // '<' - case 0x3e: // '>' - { - if (enc->encodeHTMLChars) - { - // Fall through to \u00XX case below. - } - else - { - // Same as default case below. - (*of++) = (*io); - break; +int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io, + const char *end) { + char *of = (char *)enc->offset; + + for (;;) { + switch (*io) { + case 0x00: { + if (io < end) { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + break; + } else { + enc->offset += (of - enc->offset); + return TRUE; + } + } + case '\"': + (*of++) = '\\'; + (*of++) = '\"'; + break; + case '\\': + (*of++) = '\\'; + (*of++) = '\\'; + break; + case '/': + (*of++) = '\\'; + (*of++) = '/'; + break; + case '\b': + (*of++) = '\\'; + (*of++) = 'b'; + break; + case '\f': + (*of++) = '\\'; + (*of++) = 'f'; + break; + case '\n': + (*of++) = '\\'; + (*of++) = 'n'; + break; + case '\r': + (*of++) = '\\'; + (*of++) = 'r'; + break; + case '\t': + (*of++) = '\\'; + (*of++) = 't'; + break; + + case 0x26: // '/' + case 0x3c: // '<' + case 0x3e: // '>' + { + if (enc->encodeHTMLChars) { + // Fall through to \u00XX case below. + } else { + // Same as default case below. + (*of++) = (*io); + break; + } + } + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; + break; + } + default: + (*of++) = (*io); + break; } - } - case 0x01: - case 0x02: - case 0x03: - case 0x04: - case 0x05: - case 0x06: - case 0x07: - case 0x0b: - case 0x0e: - case 0x0f: - case 0x10: - case 0x11: - case 0x12: - case 0x13: - case 0x14: - case 0x15: - case 0x16: - case 0x17: - case 0x18: - case 0x19: - case 0x1a: - case 0x1b: - case 0x1c: - case 0x1d: - case 0x1e: - case 0x1f: - { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; - break; - } - default: (*of++) = (*io); break; - } - io++; - } + io++; + } } -int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) -{ - JSUTF32 ucs; - char *of = (char *) enc->offset; - - for (;;) - { - JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io]; - - switch (utflen) - { - case 0: - { - if (io < end) - { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - io ++; - continue; - } - else - { - enc->offset += (of - enc->offset); - return TRUE; - } - } - - case 1: - { - *(of++)= (*io++); - continue; - } - - case 2: - { - JSUTF32 in; - JSUTF16 in16; - - if (end - io < 1) - { - enc->offset += (of - enc->offset); - SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in16, io, 
sizeof(JSUTF16)); - in = (JSUTF32) in16; +int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, + const char *io, const char *end) { + JSUTF32 ucs; + char *of = (char *)enc->offset; + + for (;;) { + JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io]; + + switch (utflen) { + case 0: { + if (io < end) { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + io++; + continue; + } else { + enc->offset += (of - enc->offset); + return TRUE; + } + } + + case 1: { + *(of++) = (*io++); + continue; + } + + case 2: { + JSUTF32 in; + JSUTF16 in16; + + if (end - io < 1) { + enc->offset += (of - enc->offset); + SetError( + obj, enc, + "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + in = (JSUTF32)in16; #ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); + ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); #else - ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); + ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x80) - { - enc->offset += (of - enc->offset); - SetError (obj, enc, "Overlong 2 byte UTF-8 sequence detected when encoding string"); - return FALSE; - } - - io += 2; - break; - } - - case 3: - { - JSUTF32 in; - JSUTF16 in16; - JSUINT8 in8; - - if (end - io < 2) - { - enc->offset += (of - enc->offset); - SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in16, io, sizeof(JSUTF16)); - memcpy(&in8, io + 2, sizeof(JSUINT8)); + if (ucs < 0x80) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 2 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 2; + break; + } + + case 3: { + JSUTF32 in; + JSUTF16 in16; + JSUINT8 in8; + + if (end - io < 2) { + enc->offset += (of - enc->offset); + SetError( + obj, enc, + "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + memcpy(&in8, io + 2, sizeof(JSUINT8)); #ifdef __LITTLE_ENDIAN__ - in = (JSUTF32) in16; - in |= in8 << 16; - ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16); + in = (JSUTF32)in16; + in |= in8 << 16; + ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | + ((in & 0x3f0000) >> 16); #else - in = in16 << 8; - in |= in8; - ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); + in = in16 << 8; + in |= in8; + ucs = + ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x800) - { - enc->offset += (of - enc->offset); - SetError (obj, enc, "Overlong 3 byte UTF-8 sequence detected when encoding string"); - return FALSE; - } - - io += 3; - break; - } - case 4: - { - JSUTF32 in; - - if (end - io < 3) - { - enc->offset += (of - enc->offset); - SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in, io, sizeof(JSUTF32)); + if (ucs < 0x800) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 3 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 3; + break; + } + case 4: { + JSUTF32 in; + + if (end - io < 3) { + enc->offset += (of - enc->offset); + SetError( + obj, enc, + "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in, io, sizeof(JSUTF32)); #ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); + ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | + ((in 
& 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); #else - ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); + ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | + ((in & 0x3f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x10000) - { - enc->offset += (of - enc->offset); - SetError (obj, enc, "Overlong 4 byte UTF-8 sequence detected when encoding string"); - return FALSE; + if (ucs < 0x10000) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 4 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 4; + break; + } + + case 5: + case 6: { + enc->offset += (of - enc->offset); + SetError( + obj, enc, + "Unsupported UTF-8 sequence length when encoding string"); + return FALSE; + } + + case 29: { + if (enc->encodeHTMLChars) { + // Fall through to \u00XX case 30 below. + } else { + // Same as case 1 above. + *(of++) = (*io++); + continue; + } + } + + case 30: { + // \uXXXX encode + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; + io++; + continue; + } + case 10: + case 12: + case 14: + case 16: + case 18: + case 20: + case 22: + case 24: { + *(of++) = *((char *)(g_escapeChars + utflen + 0)); + *(of++) = *((char *)(g_escapeChars + utflen + 1)); + io++; + continue; + } + // This can never happen, it's here to make L4 VC++ happy + default: { + ucs = 0; + break; + } } - io += 4; - break; - } - - - case 5: - case 6: - { - enc->offset += (of - enc->offset); - SetError (obj, enc, "Unsupported UTF-8 sequence length when encoding string"); - return FALSE; - } - - case 29: - { - if (enc->encodeHTMLChars) - { - // Fall through to \u00XX case 30 below. - } - else - { - // Same as case 1 above. 
- *(of++) = (*io++); - continue; + /* + If the character is a UTF8 sequence of length > 1 we end up here */ + if (ucs >= 0x10000) { + ucs -= 0x10000; + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked( + of, (unsigned short)(ucs >> 10) + 0xd800); + of += 4; + + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked( + of, (unsigned short)(ucs & 0x3ff) + 0xdc00); + of += 4; + } else { + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs); + of += 4; } - } - - case 30: - { - // \uXXXX encode - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; - io ++; - continue; - } - case 10: - case 12: - case 14: - case 16: - case 18: - case 20: - case 22: - case 24: - { - *(of++) = *( (char *) (g_escapeChars + utflen + 0)); - *(of++) = *( (char *) (g_escapeChars + utflen + 1)); - io ++; - continue; - } - // This can never happen, it's here to make L4 VC++ happy - default: - { - ucs = 0; - break; - } } +} - /* - If the character is a UTF8 sequence of length > 1 we end up here */ - if (ucs >= 0x10000) - { - ucs -= 0x10000; - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, (unsigned short) (ucs >> 10) + 0xd800); - of += 4; - - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, (unsigned short) (ucs & 0x3ff) + 0xdc00); - of += 4; - } - else - { - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, (unsigned short) ucs); - of += 4; +#define Buffer_Reserve(__enc, __len) \ + if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ + Buffer_Realloc((__enc), (__len)); \ } - } -} -#define Buffer_Reserve(__enc, __len) \ - if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ - { \ - Buffer_Realloc((__enc), (__len));\ - } \ +#define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, + char *end) { + char aux; + while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux; +} -#define Buffer_AppendCharUnchecked(__enc, __chr) \ - *((__enc)->offset++) = __chr; \ +void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { + char *wstr; + JSUINT32 uvalue = (value < 0) ? -value : value; + wstr = enc->offset; -FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char* begin, char* end) -{ - char aux; - while (end > begin) - aux = *end, *end-- = *begin, *begin++ = aux; + // Conversion. Number is reversed. + do { + *wstr++ = (char)(48 + (uvalue % 10)); + } while (uvalue /= 10); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset, wstr - 1); + enc->offset += (wstr - (enc->offset)); } -void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) -{ - char* wstr; - JSUINT32 uvalue = (value < 0) ? -value : value; +void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { + char *wstr; + JSUINT64 uvalue = (value < 0) ? -value : value; - wstr = enc->offset; - // Conversion. Number is reversed. + wstr = enc->offset; + // Conversion. Number is reversed. 
- do *wstr++ = (char)(48 + (uvalue % 10)); while(uvalue /= 10); - if (value < 0) *wstr++ = '-'; + do { + *wstr++ = (char)(48 + (uvalue % 10ULL)); + } while (uvalue /= 10ULL); + if (value < 0) *wstr++ = '-'; - // Reverse string - strreverse(enc->offset,wstr - 1); - enc->offset += (wstr - (enc->offset)); + // Reverse string + strreverse(enc->offset, wstr - 1); + enc->offset += (wstr - (enc->offset)); } -void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) -{ - char* wstr; - JSUINT64 uvalue = (value < 0) ? -value : value; - - wstr = enc->offset; - // Conversion. Number is reversed. +int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, + double value) { + /* if input is beyond the thresholds, revert to exponential */ + const double thres_max = (double)1e16 - 1; + const double thres_min = (double)1e-15; + char precision_str[20]; + int count; + double diff = 0.0; + char *str = enc->offset; + char *wstr = str; + unsigned long long whole; + double tmp; + unsigned long long frac; + int neg; + double pow10; + + if (value == HUGE_VAL || value == -HUGE_VAL) { + SetError(obj, enc, "Invalid Inf value when encoding double"); + return FALSE; + } - do *wstr++ = (char)(48 + (uvalue % 10ULL)); while(uvalue /= 10ULL); - if (value < 0) *wstr++ = '-'; + if (!(value == value)) { + SetError(obj, enc, "Invalid Nan value when encoding double"); + return FALSE; + } - // Reverse string - strreverse(enc->offset,wstr - 1); - enc->offset += (wstr - (enc->offset)); -} + /* we'll work in positive values and deal with the + negative sign issue later */ + neg = 0; + if (value < 0) { + neg = 1; + value = -value; + } -int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, double value) -{ - /* if input is beyond the thresholds, revert to exponential */ - const double thres_max = (double) 1e16 - 1; - const double thres_min = (double) 1e-15; - char precision_str[20]; - int count; - double diff = 0.0; - char* str = enc->offset; - char* wstr = str; - unsigned long long whole; - double tmp; - unsigned long long frac; - int neg; - double pow10; - - if (value == HUGE_VAL || value == -HUGE_VAL) - { - SetError (obj, enc, "Invalid Inf value when encoding double"); - return FALSE; - } - - if (!(value == value)) - { - SetError (obj, enc, "Invalid Nan value when encoding double"); - return FALSE; - } - - /* we'll work in positive values and deal with the - negative sign issue later */ - neg = 0; - if (value < 0) - { - neg = 1; - value = -value; - } - - /* - for very large or small numbers switch back to native sprintf for - exponentials. anyone want to write code to replace this? */ - if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) - { - precision_str[0] = '%'; - precision_str[1] = '.'; + /* + for very large or small numbers switch back to native sprintf for + exponentials. anyone want to write code to replace this? */ + if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) { + precision_str[0] = '%'; + precision_str[1] = '.'; #if defined(_WIN32) && defined(_MSC_VER) - sprintf_s(precision_str+2, sizeof(precision_str)-2, "%ug", enc->doublePrecision); - enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, neg ? -value : value); + sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug", + enc->doublePrecision); + enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, + neg ? 
-value : value); #else - snprintf(precision_str+2, sizeof(precision_str)-2, "%ug", enc->doublePrecision); - enc->offset += snprintf(str, enc->end - enc->offset, precision_str, neg ? -value : value); + snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug", + enc->doublePrecision); + enc->offset += snprintf(str, enc->end - enc->offset, precision_str, + neg ? -value : value); #endif - return TRUE; - } - - pow10 = g_pow10[enc->doublePrecision]; - - whole = (unsigned long long) value; - tmp = (value - whole) * pow10; - frac = (unsigned long long)(tmp); - diff = tmp - frac; - - if (diff > 0.5) - { - ++frac; - /* handle rollover, e.g. case 0.99 with prec 1 is 1.0 */ - if (frac >= pow10) - { - frac = 0; - ++whole; - } - } - else - if (diff == 0.5 && ((frac == 0) || (frac & 1))) - { - /* if halfway, round up if odd, OR - if last digit is 0. That last part is strange */ - ++frac; - } - - if (enc->doublePrecision == 0) - { - diff = value - whole; - - if (diff > 0.5) - { - /* greater than 0.5, round up, e.g. 1.6 -> 2 */ - ++whole; + return TRUE; } - else - if (diff == 0.5 && (whole & 1)) - { - /* exactly 0.5 and ODD, then round up */ - /* 1.5 -> 2, but 2.5 -> 2 */ - ++whole; + + pow10 = g_pow10[enc->doublePrecision]; + + whole = (unsigned long long)value; + tmp = (value - whole) * pow10; + frac = (unsigned long long)(tmp); + diff = tmp - frac; + + if (diff > 0.5) { + ++frac; + /* handle rollover, e.g. case 0.99 with prec 1 is 1.0 */ + if (frac >= pow10) { + frac = 0; + ++whole; + } + } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) { + /* if halfway, round up if odd, OR + if last digit is 0. That last part is strange */ + ++frac; } - //vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 - } - else - if (frac) - { - count = enc->doublePrecision; - // now do fractional part, as an unsigned number - // we know it is not 0 but we can have leading zeros, these - // should be removed - while (!(frac % 10)) - { - --count; - frac /= 10; - } - //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 - - // now do fractional part, as an unsigned number - do - { - --count; - *wstr++ = (char)(48 + (frac % 10)); - } while (frac /= 10); - // add extra 0s - while (count-- > 0) - { + if (enc->doublePrecision == 0) { + diff = value - whole; + + if (diff > 0.5) { + /* greater than 0.5, round up, e.g. 1.6 -> 2 */ + ++whole; + } else if (diff == 0.5 && (whole & 1)) { + /* exactly 0.5 and ODD, then round up */ + /* 1.5 -> 2, but 2.5 -> 2 */ + ++whole; + } + + // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 + } else if (frac) { + count = enc->doublePrecision; + // now do fractional part, as an unsigned number + // we know it is not 0 but we can have leading zeros, these + // should be removed + while (!(frac % 10)) { + --count; + frac /= 10; + } + //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 + + // now do fractional part, as an unsigned number + do { + --count; + *wstr++ = (char)(48 + (frac % 10)); + } while (frac /= 10); + // add extra 0s + while (count-- > 0) { + *wstr++ = '0'; + } + // add decimal + *wstr++ = '.'; + } else { *wstr++ = '0'; - } - // add decimal - *wstr++ = '.'; - } - else - { - *wstr++ = '0'; - *wstr++ = '.'; + *wstr++ = '.'; } - // do whole part - // Take care of sign - // Conversion. Number is reversed. - do *wstr++ = (char)(48 + (whole % 10)); while (whole /= 10); + // Do whole part. Take care of sign + // conversion. Number is reversed. 
+ do { + *wstr++ = (char)(48 + (whole % 10)); + } while (whole /= 10); - if (neg) - { - *wstr++ = '-'; + if (neg) { + *wstr++ = '-'; } - strreverse(str, wstr-1); + strreverse(str, wstr - 1); enc->offset += (wstr - (enc->offset)); return TRUE; @@ -661,287 +897,248 @@ Handle integration functions returning NULL here */ FIXME: Perhaps implement recursion detection */ -void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) -{ - const char *value; - char *objName; - int count; - JSOBJ iterObj; - size_t szlen; - JSONTypeContext tc; - tc.encoder = enc; - - if (enc->level > enc->recursionMax) - { - SetError (obj, enc, "Maximum recursion level reached"); - return; - } - - /* - This reservation must hold - - length of _name as encoded worst case + - maxLength of double to string OR maxLength of JSLONG to string - */ - - Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); - if (enc->errorMsg) - { - return; - } - - if (name) - { - Buffer_AppendCharUnchecked(enc, '\"'); - - if (enc->forceASCII) - { - if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) - { +void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, + size_t cbName) { + const char *value; + char *objName; + int count; + JSOBJ iterObj; + size_t szlen; + JSONTypeContext tc; + tc.encoder = enc; + + if (enc->level > enc->recursionMax) { + SetError(obj, enc, "Maximum recursion level reached"); return; - } } - else - { - if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) - { + + /* + This reservation must hold + + length of _name as encoded worst case + + maxLength of double to string OR maxLength of JSLONG to string + */ + + Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); + if (enc->errorMsg) { return; - } } - Buffer_AppendCharUnchecked(enc, '\"'); + if (name) { + Buffer_AppendCharUnchecked(enc, '\"'); + + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) { + return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) { + return; + } + } + + Buffer_AppendCharUnchecked(enc, '\"'); - Buffer_AppendCharUnchecked (enc, ':'); + Buffer_AppendCharUnchecked(enc, ':'); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked (enc, ' '); + Buffer_AppendCharUnchecked(enc, ' '); #endif } enc->beginTypeContext(obj, &tc); - switch (tc.type) - { - case JT_INVALID: - { - return; - } + switch (tc.type) { + case JT_INVALID: { + return; + } - case JT_ARRAY: - { - count = 0; - enc->iterBegin(obj, &tc); + case JT_ARRAY: { + count = 0; + enc->iterBegin(obj, &tc); - Buffer_AppendCharUnchecked (enc, '['); + Buffer_AppendCharUnchecked(enc, '['); - while (enc->iterNext(obj, &tc)) - { - if (count > 0) - { - Buffer_AppendCharUnchecked (enc, ','); + while (enc->iterNext(obj, &tc)) { + if (count > 0) { + Buffer_AppendCharUnchecked(enc, ','); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked (buffer, ' '); + Buffer_AppendCharUnchecked(buffer, ' '); #endif - } + } - iterObj = enc->iterGetValue(obj, &tc); + iterObj = enc->iterGetValue(obj, &tc); - enc->level ++; - encode (iterObj, enc, NULL, 0); - count ++; - } + enc->level++; + encode(iterObj, enc, NULL, 0); + count++; + } - enc->iterEnd(obj, &tc); - Buffer_AppendCharUnchecked (enc, ']'); - break; - } + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked(enc, ']'); + break; + } - case JT_OBJECT: - { - count = 0; - enc->iterBegin(obj, &tc); + case JT_OBJECT: { + count = 0; + enc->iterBegin(obj, &tc); - Buffer_AppendCharUnchecked (enc, '{'); + Buffer_AppendCharUnchecked(enc, '{'); - 
while (enc->iterNext(obj, &tc)) - { - if (count > 0) - { - Buffer_AppendCharUnchecked (enc, ','); + while (enc->iterNext(obj, &tc)) { + if (count > 0) { + Buffer_AppendCharUnchecked(enc, ','); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked (enc, ' '); + Buffer_AppendCharUnchecked(enc, ' '); #endif - } + } - iterObj = enc->iterGetValue(obj, &tc); - objName = enc->iterGetName(obj, &tc, &szlen); + iterObj = enc->iterGetValue(obj, &tc); + objName = enc->iterGetName(obj, &tc, &szlen); - enc->level ++; - encode (iterObj, enc, objName, szlen); - count ++; - } + enc->level++; + encode(iterObj, enc, objName, szlen); + count++; + } - enc->iterEnd(obj, &tc); - Buffer_AppendCharUnchecked (enc, '}'); - break; - } - - case JT_LONG: - { - Buffer_AppendLongUnchecked (enc, enc->getLongValue(obj, &tc)); - break; - } - - case JT_INT: - { - Buffer_AppendIntUnchecked (enc, enc->getIntValue(obj, &tc)); - break; - } - - case JT_TRUE: - { - Buffer_AppendCharUnchecked (enc, 't'); - Buffer_AppendCharUnchecked (enc, 'r'); - Buffer_AppendCharUnchecked (enc, 'u'); - Buffer_AppendCharUnchecked (enc, 'e'); - break; - } - - case JT_FALSE: - { - Buffer_AppendCharUnchecked (enc, 'f'); - Buffer_AppendCharUnchecked (enc, 'a'); - Buffer_AppendCharUnchecked (enc, 'l'); - Buffer_AppendCharUnchecked (enc, 's'); - Buffer_AppendCharUnchecked (enc, 'e'); - break; - } - - - case JT_NULL: - { - Buffer_AppendCharUnchecked (enc, 'n'); - Buffer_AppendCharUnchecked (enc, 'u'); - Buffer_AppendCharUnchecked (enc, 'l'); - Buffer_AppendCharUnchecked (enc, 'l'); - break; - } - - case JT_DOUBLE: - { - if (!Buffer_AppendDoubleUnchecked (obj, enc, enc->getDoubleValue(obj, &tc))) - { - enc->endTypeContext(obj, &tc); - enc->level --; - return; - } - break; - } - - case JT_UTF8: - { - value = enc->getStringValue(obj, &tc, &szlen); - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - if (enc->errorMsg) - { - enc->endTypeContext(obj, &tc); - return; - } - Buffer_AppendCharUnchecked (enc, '\"'); - - if (enc->forceASCII) - { - if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) - { - enc->endTypeContext(obj, &tc); - enc->level --; - return; + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked(enc, '}'); + break; + } + + case JT_LONG: { + Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); + break; + } + + case JT_INT: { + Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); + break; + } + + case JT_TRUE: { + Buffer_AppendCharUnchecked(enc, 't'); + Buffer_AppendCharUnchecked(enc, 'r'); + Buffer_AppendCharUnchecked(enc, 'u'); + Buffer_AppendCharUnchecked(enc, 'e'); + break; + } + + case JT_FALSE: { + Buffer_AppendCharUnchecked(enc, 'f'); + Buffer_AppendCharUnchecked(enc, 'a'); + Buffer_AppendCharUnchecked(enc, 'l'); + Buffer_AppendCharUnchecked(enc, 's'); + Buffer_AppendCharUnchecked(enc, 'e'); + break; + } + + case JT_NULL: { + Buffer_AppendCharUnchecked(enc, 'n'); + Buffer_AppendCharUnchecked(enc, 'u'); + Buffer_AppendCharUnchecked(enc, 'l'); + Buffer_AppendCharUnchecked(enc, 'l'); + break; } - } - else - { - if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) - { - enc->endTypeContext(obj, &tc); - enc->level --; - return; + + case JT_DOUBLE: { + if (!Buffer_AppendDoubleUnchecked(obj, enc, + enc->getDoubleValue(obj, &tc))) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + break; } - } - Buffer_AppendCharUnchecked (enc, '\"'); - break; + case JT_UTF8: { + value = enc->getStringValue(obj, &tc, &szlen); + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + if (enc->errorMsg) { + 
enc->endTypeContext(obj, &tc); + return; + } + Buffer_AppendCharUnchecked(enc, '\"'); + + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } + + Buffer_AppendCharUnchecked(enc, '\"'); + break; + } } - } - enc->endTypeContext(obj, &tc); - enc->level --; + enc->endTypeContext(obj, &tc); + enc->level--; } -char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer) -{ - char *locale; - enc->malloc = enc->malloc ? enc->malloc : malloc; - enc->free = enc->free ? enc->free : free; - enc->realloc = enc->realloc ? enc->realloc : realloc; - enc->errorMsg = NULL; - enc->errorObj = NULL; - enc->level = 0; - - if (enc->recursionMax < 1) - { - enc->recursionMax = JSON_MAX_RECURSION_DEPTH; - } - - if (enc->doublePrecision < 0 || - enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) - { - enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; - } - - if (_buffer == NULL) - { - _cbBuffer = 32768; - enc->start = (char *) enc->malloc (_cbBuffer); - if (!enc->start) - { - SetError(obj, enc, "Could not reserve memory block"); - return NULL; +char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, + size_t _cbBuffer) { + char *locale; + enc->malloc = enc->malloc ? enc->malloc : malloc; + enc->free = enc->free ? enc->free : free; + enc->realloc = enc->realloc ? enc->realloc : realloc; + enc->errorMsg = NULL; + enc->errorObj = NULL; + enc->level = 0; + + if (enc->recursionMax < 1) { + enc->recursionMax = JSON_MAX_RECURSION_DEPTH; } - enc->heap = 1; - } - else - { - enc->start = _buffer; - enc->heap = 0; - } - - enc->end = enc->start + _cbBuffer; - enc->offset = enc->start; - - locale = setlocale(LC_NUMERIC, NULL); - if (strcmp(locale, "C")) - { - locale = strdup(locale); - if (!locale) - { - SetError(NULL, enc, "Could not reserve memory block"); - return NULL; + + if (enc->doublePrecision < 0 || + enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) { + enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; } - setlocale(LC_NUMERIC, "C"); - encode (obj, enc, NULL, 0); - setlocale(LC_NUMERIC, locale); - free(locale); - } - else - { - encode (obj, enc, NULL, 0); - } - - Buffer_Reserve(enc, 1); - if (enc->errorMsg) - { - return NULL; - } - Buffer_AppendCharUnchecked(enc, '\0'); - - return enc->start; + + if (_buffer == NULL) { + _cbBuffer = 32768; + enc->start = (char *)enc->malloc(_cbBuffer); + if (!enc->start) { + SetError(obj, enc, "Could not reserve memory block"); + return NULL; + } + enc->heap = 1; + } else { + enc->start = _buffer; + enc->heap = 0; + } + + enc->end = enc->start + _cbBuffer; + enc->offset = enc->start; + + locale = setlocale(LC_NUMERIC, NULL); + if (strcmp(locale, "C")) { + locale = strdup(locale); + if (!locale) { + SetError(NULL, enc, "Could not reserve memory block"); + return NULL; + } + setlocale(LC_NUMERIC, "C"); + encode(obj, enc, NULL, 0); + setlocale(LC_NUMERIC, locale); + free(locale); + } else { + encode(obj, enc, NULL, 0); + } + + Buffer_Reserve(enc, 1); + if (enc->errorMsg) { + return NULL; + } + Buffer_AppendCharUnchecked(enc, '\0'); + + return enc->start; } diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c index e4d02db4cb60a..b0132532c16af 100644 --- a/pandas/src/ujson/python/JSONtoObj.c +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -35,38 +35,37 @@ Numeric 
decoder derived from from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ +// "py_defines.h" needs to be included first to +// avoid compilation errors, but it does violate +// styleguide checks with regards to include order. #include "py_defines.h" #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY #define NO_IMPORT_ARRAY -#include -#include +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) - -//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) #define PRINTMARK() -typedef struct __PyObjectDecoder -{ - JSONObjectDecoder dec; +typedef struct __PyObjectDecoder { + JSONObjectDecoder dec; - void* npyarr; // Numpy context buffer - void* npyarr_addr; // Ref to npyarr ptr to track DECREF calls - npy_intp curdim; // Current array dimension + void *npyarr; // Numpy context buffer + void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls + npy_intp curdim; // Current array dimension - PyArray_Descr* dtype; + PyArray_Descr *dtype; } PyObjectDecoder; -typedef struct __NpyArrContext -{ - PyObject* ret; - PyObject* labels[2]; - PyArray_Dims shape; +typedef struct __NpyArrContext { + PyObject *ret; + PyObject *labels[2]; + PyArray_Dims shape; - PyObjectDecoder* dec; + PyObjectDecoder *dec; - npy_intp i; - npy_intp elsize; - npy_intp elcount; + npy_intp i; + npy_intp elsize; + npy_intp elcount; } NpyArrContext; // Numpy handling based on numpy internal code, specifically the function @@ -76,661 +75,564 @@ typedef struct __NpyArrContext // to ensure the compiler catches any errors // standard numpy array handling -JSOBJ Object_npyNewArray(void *prv, void* decoder); +JSOBJ Object_npyNewArray(void *prv, void *decoder); JSOBJ Object_npyEndArray(void *prv, JSOBJ obj); int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value); // for more complex dtypes (object and string) fill a standard Python list // and convert to a numpy array when done. 
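// The switch between the two paths happens at run time: when
// Object_npyArrayAddItem (below) sniffs an object or variable-length dtype it
// retargets the decoder's newArray/arrayAddItem/endArray callbacks to the
// *List variants, and Object_npyEndArrayList points them back once the list
// has been converted. A minimal, self-contained sketch of that callback-swap
// pattern (all names here are hypothetical, not part of this file):
#if 0
typedef struct SketchDecoder {
    int (*addItem)(struct SketchDecoder *dec, void *item);
} SketchDecoder;

static int addToList(struct SketchDecoder *dec, void *item) {
    (void)dec;
    (void)item;  /* append to a growable Python list instead of the buffer */
    return 1;
}

static int addToArray(struct SketchDecoder *dec, void *item) {
    if (item == NULL) {            /* stand-in for "dtype too complex" */
        dec->addItem = addToList;  /* swap handler for the rest of the decode */
        return addToList(dec, item);
    }
    return 1;                      /* write straight into the numpy buffer */
}
#endif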
-JSOBJ Object_npyNewArrayList(void *prv, void* decoder); +JSOBJ Object_npyNewArrayList(void *prv, void *decoder); JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj); int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value); // labelled support, encode keys and values of JS object into separate numpy // arrays -JSOBJ Object_npyNewObject(void *prv, void* decoder); +JSOBJ Object_npyNewObject(void *prv, void *decoder); JSOBJ Object_npyEndObject(void *prv, JSOBJ obj); int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value); // free the numpy context buffer -void Npy_releaseContext(NpyArrContext* npyarr) -{ - PRINTMARK(); - if (npyarr) - { - if (npyarr->shape.ptr) - { - PyObject_Free(npyarr->shape.ptr); - } - if (npyarr->dec) - { - npyarr->dec->npyarr = NULL; - npyarr->dec->curdim = 0; - } - Py_XDECREF(npyarr->labels[0]); - Py_XDECREF(npyarr->labels[1]); - Py_XDECREF(npyarr->ret); - PyObject_Free(npyarr); - } +void Npy_releaseContext(NpyArrContext *npyarr) { + PRINTMARK(); + if (npyarr) { + if (npyarr->shape.ptr) { + PyObject_Free(npyarr->shape.ptr); + } + if (npyarr->dec) { + npyarr->dec->npyarr = NULL; + npyarr->dec->curdim = 0; + } + Py_XDECREF(npyarr->labels[0]); + Py_XDECREF(npyarr->labels[1]); + Py_XDECREF(npyarr->ret); + PyObject_Free(npyarr); + } } -JSOBJ Object_npyNewArray(void *prv, void* _decoder) -{ - NpyArrContext* npyarr; - PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; - PRINTMARK(); - if (decoder->curdim <= 0) - { - // start of array - initialise the context buffer - npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); - decoder->npyarr_addr = npyarr; - - if (!npyarr) - { - PyErr_NoMemory(); - return NULL; - } - - npyarr->dec = decoder; - npyarr->labels[0] = npyarr->labels[1] = NULL; - - npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp)*NPY_MAXDIMS); - npyarr->shape.len = 1; - npyarr->ret = NULL; - - npyarr->elsize = 0; - npyarr->elcount = 4; - npyarr->i = 0; - } - else - { - // starting a new dimension continue the current array (and reshape after) - npyarr = (NpyArrContext*) decoder->npyarr; - if (decoder->curdim >= npyarr->shape.len) - { - npyarr->shape.len++; - } - } - - npyarr->shape.ptr[decoder->curdim] = 0; - decoder->curdim++; - return npyarr; -} +JSOBJ Object_npyNewArray(void *prv, void *_decoder) { + NpyArrContext *npyarr; + PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; + PRINTMARK(); + if (decoder->curdim <= 0) { + // start of array - initialise the context buffer + npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + decoder->npyarr_addr = npyarr; + + if (!npyarr) { + PyErr_NoMemory(); + return NULL; + } + + npyarr->dec = decoder; + npyarr->labels[0] = npyarr->labels[1] = NULL; + + npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS); + npyarr->shape.len = 1; + npyarr->ret = NULL; + + npyarr->elsize = 0; + npyarr->elcount = 4; + npyarr->i = 0; + } else { + // starting a new dimension continue the current array (and reshape + // after) + npyarr = (NpyArrContext *)decoder->npyarr; + if (decoder->curdim >= npyarr->shape.len) { + npyarr->shape.len++; + } + } -PyObject* Npy_returnLabelled(NpyArrContext* npyarr) -{ - PyObject* ret = npyarr->ret; - npy_intp i; - - if (npyarr->labels[0] || npyarr->labels[1]) - { - // finished decoding, build tuple with values and labels - ret = PyTuple_New(npyarr->shape.len+1); - for (i = 0; i < npyarr->shape.len; i++) - { - if (npyarr->labels[i]) - { - PyTuple_SET_ITEM(ret, i+1, npyarr->labels[i]); - npyarr->labels[i] = NULL; - } - else - { - 
Py_INCREF(Py_None); - PyTuple_SET_ITEM(ret, i+1, Py_None); - } - } - PyTuple_SET_ITEM(ret, 0, npyarr->ret); - } - - return ret; + npyarr->shape.ptr[decoder->curdim] = 0; + decoder->curdim++; + return npyarr; } -JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) -{ - PyObject *ret; - char* new_data; - NpyArrContext* npyarr = (NpyArrContext*) obj; - int emptyType = NPY_DEFAULT_TYPE; - npy_intp i; - PRINTMARK(); - if (!npyarr) - { - return NULL; - } - - ret = npyarr->ret; - i = npyarr->i; - - npyarr->dec->curdim--; - - if (i == 0 || !npyarr->ret) { - // empty array would not have been initialised so do it now. - if (npyarr->dec->dtype) - { - emptyType = npyarr->dec->dtype->type_num; - } - npyarr->ret = ret = PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); - } - else if (npyarr->dec->curdim <= 0) - { - // realloc to final size - new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); - if (new_data == NULL) { - PyErr_NoMemory(); - Npy_releaseContext(npyarr); - return NULL; - } - ((PyArrayObject*) ret)->data = (void*) new_data; - // PyArray_BYTES(ret) = new_data; - } - - if (npyarr->dec->curdim <= 0) - { - // finished decoding array, reshape if necessary - if (npyarr->shape.len > 1) - { - npyarr->ret = PyArray_Newshape((PyArrayObject*) ret, &npyarr->shape, NPY_ANYORDER); - Py_DECREF(ret); +PyObject *Npy_returnLabelled(NpyArrContext *npyarr) { + PyObject *ret = npyarr->ret; + npy_intp i; + + if (npyarr->labels[0] || npyarr->labels[1]) { + // finished decoding, build tuple with values and labels + ret = PyTuple_New(npyarr->shape.len + 1); + for (i = 0; i < npyarr->shape.len; i++) { + if (npyarr->labels[i]) { + PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]); + npyarr->labels[i] = NULL; + } else { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(ret, i + 1, Py_None); + } + } + PyTuple_SET_ITEM(ret, 0, npyarr->ret); } - ret = Npy_returnLabelled(npyarr); - - npyarr->ret = NULL; - Npy_releaseContext(npyarr); - } - - return ret; + return ret; } -int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) -{ - PyObject* type; - PyArray_Descr* dtype; - npy_intp i; - char *new_data, *item; - NpyArrContext* npyarr = (NpyArrContext*) obj; - PRINTMARK(); - if (!npyarr) - { - return 0; - } +JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) { + PyObject *ret; + char *new_data; + NpyArrContext *npyarr = (NpyArrContext *)obj; + int emptyType = NPY_DEFAULT_TYPE; + npy_intp i; + PRINTMARK(); + if (!npyarr) { + return NULL; + } - i = npyarr->i; + ret = npyarr->ret; + i = npyarr->i; + + npyarr->dec->curdim--; + + if (i == 0 || !npyarr->ret) { + // empty array would not have been initialised so do it now. + if (npyarr->dec->dtype) { + emptyType = npyarr->dec->dtype->type_num; + } + npyarr->ret = ret = + PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); + } else if (npyarr->dec->curdim <= 0) { + // realloc to final size + new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); + if (new_data == NULL) { + PyErr_NoMemory(); + Npy_releaseContext(npyarr); + return NULL; + } + ((PyArrayObject *)ret)->data = (void *)new_data; + // PyArray_BYTES(ret) = new_data; + } - npyarr->shape.ptr[npyarr->dec->curdim-1]++; + if (npyarr->dec->curdim <= 0) { + // finished decoding array, reshape if necessary + if (npyarr->shape.len > 1) { + npyarr->ret = PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape, + NPY_ANYORDER); + Py_DECREF(ret); + } - if (PyArray_Check((PyObject*)value)) - { - // multidimensional array, keep decoding values. 
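    // (The inner dimension has already written its items into this same
    // shared flat buffer, and the shape counter above has been bumped, so
    // there is nothing to append at this level.)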
- return 1; - } - - if (!npyarr->ret) - { - // Array not initialised yet. - // We do it here so we can 'sniff' the data type if none was provided - if (!npyarr->dec->dtype) - { - type = PyObject_Type(value); - if(!PyArray_DescrConverter(type, &dtype)) - { - Py_DECREF(type); - goto fail; - } - Py_INCREF(dtype); - Py_DECREF(type); - } - else - { - dtype = PyArray_DescrNew(npyarr->dec->dtype); - } - - // If it's an object or string then fill a Python list and subsequently - // convert. Otherwise we would need to somehow mess about with - // reference counts when renewing memory. - npyarr->elsize = dtype->elsize; - if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) - { - Py_XDECREF(dtype); - - if (npyarr->dec->curdim > 1) - { - PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); - goto fail; - } - npyarr->elcount = 0; - npyarr->ret = PyList_New(0); - if (!npyarr->ret) - { - goto fail; - } - ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArrayList; - ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayListAddItem; - ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArrayList; - return Object_npyArrayListAddItem(prv, obj, value); + ret = Npy_returnLabelled(npyarr); + + npyarr->ret = NULL; + Npy_releaseContext(npyarr); } - npyarr->ret = PyArray_NewFromDescr(&PyArray_Type, dtype, 1, - &npyarr->elcount, NULL,NULL, 0, NULL); + return ret; +} - if (!npyarr->ret) - { - goto fail; +int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { + PyObject *type; + PyArray_Descr *dtype; + npy_intp i; + char *new_data, *item; + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return 0; } - } - if (i >= npyarr->elcount) { - // Grow PyArray_DATA(ret): - // this is similar for the strategy for PyListObject, but we use - // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... - if (npyarr->elsize == 0) - { - PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); - goto fail; - } + i = npyarr->i; - npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i; - if (npyarr->elcount <= NPY_MAX_INTP/npyarr->elsize) { - new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), npyarr->elcount * npyarr->elsize); + npyarr->shape.ptr[npyarr->dec->curdim - 1]++; + + if (PyArray_Check((PyObject *)value)) { + // multidimensional array, keep decoding values. + return 1; } - else { - PyErr_NoMemory(); - goto fail; + + if (!npyarr->ret) { + // Array not initialised yet. + // We do it here so we can 'sniff' the data type if none was provided + if (!npyarr->dec->dtype) { + type = PyObject_Type(value); + if (!PyArray_DescrConverter(type, &dtype)) { + Py_DECREF(type); + goto fail; + } + Py_INCREF(dtype); + Py_DECREF(type); + } else { + dtype = PyArray_DescrNew(npyarr->dec->dtype); + } + + // If it's an object or string then fill a Python list and subsequently + // convert. Otherwise we would need to somehow mess about with + // reference counts when renewing memory. 
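        // For reference, the 50% overallocation step further down in this
        // function, elcount = (i >> 1) + (i < 4 ? 4 : 2) + i, reproduces the
        // documented growth sequence:
        //   i = 0  ->  0 + 4 + 0  = 4
        //   i = 4  ->  2 + 2 + 4  = 8
        //   i = 8  ->  4 + 2 + 8  = 14
        //   i = 14 ->  7 + 2 + 14 = 23
        //   i = 23 -> 11 + 2 + 23 = 36
        //   i = 36 -> 18 + 2 + 36 = 56
        //   i = 56 -> 28 + 2 + 56 = 86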
+ npyarr->elsize = dtype->elsize; + if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) { + Py_XDECREF(dtype); + + if (npyarr->dec->curdim > 1) { + PyErr_SetString(PyExc_ValueError, + "Cannot decode multidimensional arrays with " + "variable length elements to numpy"); + goto fail; + } + npyarr->elcount = 0; + npyarr->ret = PyList_New(0); + if (!npyarr->ret) { + goto fail; + } + ((JSONObjectDecoder *)npyarr->dec)->newArray = + Object_npyNewArrayList; + ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = + Object_npyArrayListAddItem; + ((JSONObjectDecoder *)npyarr->dec)->endArray = + Object_npyEndArrayList; + return Object_npyArrayListAddItem(prv, obj, value); + } + + npyarr->ret = PyArray_NewFromDescr( + &PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL); + + if (!npyarr->ret) { + goto fail; + } } - ((PyArrayObject*) npyarr->ret)->data = (void*) new_data; - // PyArray_BYTES(npyarr->ret) = new_data; - } + if (i >= npyarr->elcount) { + // Grow PyArray_DATA(ret): + // this is similar for the strategy for PyListObject, but we use + // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... + if (npyarr->elsize == 0) { + PyErr_SetString(PyExc_ValueError, + "Cannot decode multidimensional arrays with " + "variable length elements to numpy"); + goto fail; + } + + npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i; + if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) { + new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), + npyarr->elcount * npyarr->elsize); + } else { + PyErr_NoMemory(); + goto fail; + } + ((PyArrayObject *)npyarr->ret)->data = (void *)new_data; + + // PyArray_BYTES(npyarr->ret) = new_data; + } - PyArray_DIMS(npyarr->ret)[0] = i + 1; + PyArray_DIMS(npyarr->ret)[0] = i + 1; - if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL - || PyArray_SETITEM(npyarr->ret, item, value) == -1) { - goto fail; - } + if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL || + PyArray_SETITEM(npyarr->ret, item, value) == -1) { + goto fail; + } - Py_DECREF( (PyObject *) value); - npyarr->i++; - return 1; + Py_DECREF((PyObject *)value); + npyarr->i++; + return 1; fail: - Npy_releaseContext(npyarr); - return 0; + Npy_releaseContext(npyarr); + return 0; } -JSOBJ Object_npyNewArrayList(void *prv, void* _decoder) -{ - PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; - PRINTMARK(); - PyErr_SetString(PyExc_ValueError, "nesting not supported for object or variable length dtypes"); - Npy_releaseContext(decoder->npyarr); - return NULL; +JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) { + PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; + PRINTMARK(); + PyErr_SetString( + PyExc_ValueError, + "nesting not supported for object or variable length dtypes"); + Npy_releaseContext(decoder->npyarr); + return NULL; } -JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) -{ - PyObject *list, *ret; - NpyArrContext* npyarr = (NpyArrContext*) obj; - PRINTMARK(); - if (!npyarr) - { - return NULL; - } +JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) { + PyObject *list, *ret; + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return NULL; + } - // convert decoded list to numpy array - list = (PyObject *) npyarr->ret; - npyarr->ret = PyArray_FROM_O(list); + // convert decoded list to numpy array + list = (PyObject *)npyarr->ret; + npyarr->ret = PyArray_FROM_O(list); - ret = Npy_returnLabelled(npyarr); - npyarr->ret = list; + ret = Npy_returnLabelled(npyarr); + npyarr->ret = list; - ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArray; - 
((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; - ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArray; - Npy_releaseContext(npyarr); - return ret; + ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray; + ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; + ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray; + Npy_releaseContext(npyarr); + return ret; } -int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) -{ - NpyArrContext* npyarr = (NpyArrContext*) obj; - PRINTMARK(); - if (!npyarr) - { - return 0; - } - PyList_Append((PyObject*) npyarr->ret, value); - Py_DECREF( (PyObject *) value); - npyarr->elcount++; - return 1; +int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) { + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return 0; + } + PyList_Append((PyObject *)npyarr->ret, value); + Py_DECREF((PyObject *)value); + npyarr->elcount++; + return 1; } +JSOBJ Object_npyNewObject(void *prv, void *_decoder) { + PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; + PRINTMARK(); + if (decoder->curdim > 1) { + PyErr_SetString(PyExc_ValueError, + "labels only supported up to 2 dimensions"); + return NULL; + } -JSOBJ Object_npyNewObject(void *prv, void* _decoder) -{ - PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; - PRINTMARK(); - if (decoder->curdim > 1) - { - PyErr_SetString(PyExc_ValueError, "labels only supported up to 2 dimensions"); - return NULL; - } - - return ((JSONObjectDecoder*)decoder)->newArray(prv, decoder); + return ((JSONObjectDecoder *)decoder)->newArray(prv, decoder); } -JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) -{ - PyObject *list; - npy_intp labelidx; - NpyArrContext* npyarr = (NpyArrContext*) obj; - PRINTMARK(); - if (!npyarr) - { - return NULL; - } +JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) { + PyObject *list; + npy_intp labelidx; + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return NULL; + } - labelidx = npyarr->dec->curdim-1; + labelidx = npyarr->dec->curdim - 1; - list = npyarr->labels[labelidx]; - if (list) - { - npyarr->labels[labelidx] = PyArray_FROM_O(list); - Py_DECREF(list); - } + list = npyarr->labels[labelidx]; + if (list) { + npyarr->labels[labelidx] = PyArray_FROM_O(list); + Py_DECREF(list); + } - return (PyObject*) ((JSONObjectDecoder*)npyarr->dec)->endArray(prv, obj); + return (PyObject *)((JSONObjectDecoder *)npyarr->dec)->endArray(prv, obj); } -int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) -{ - PyObject *label; - npy_intp labelidx; - // add key to label array, value to values array - NpyArrContext* npyarr = (NpyArrContext*) obj; - PRINTMARK(); - if (!npyarr) - { +int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { + PyObject *label; + npy_intp labelidx; + // add key to label array, value to values array + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return 0; + } + + label = (PyObject *)name; + labelidx = npyarr->dec->curdim - 1; + + if (!npyarr->labels[labelidx]) { + npyarr->labels[labelidx] = PyList_New(0); + } + + // only fill label array once, assumes all column labels are the same + // for 2-dimensional arrays. 
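    // Illustrative note: Npy_returnLabelled packs the labelled result as a
    // tuple (values, labels[0], labels[1]). For input such as
    //     {"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}
    // that is roughly (2x2 value array, outer keys ["a", "b"], inner keys
    // ["x", "y"]), assuming every row object repeats the same keys; that is
    // the assumption the size check below relies on.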
+ if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) { + PyList_Append(npyarr->labels[labelidx], label); + } + + if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) { + Py_DECREF(label); + return 1; + } return 0; - } - - label = (PyObject*) name; - labelidx = npyarr->dec->curdim-1; - - if (!npyarr->labels[labelidx]) - { - npyarr->labels[labelidx] = PyList_New(0); - } - - // only fill label array once, assumes all column labels are the same - // for 2-dimensional arrays. - if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) - { - PyList_Append(npyarr->labels[labelidx], label); - } - - if(((JSONObjectDecoder*)npyarr->dec)->arrayAddItem(prv, obj, value)) - { - Py_DECREF(label); - return 1; - } - return 0; } -int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) -{ - PyDict_SetItem (obj, name, value); - Py_DECREF( (PyObject *) name); - Py_DECREF( (PyObject *) value); - return 1; +int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { + PyDict_SetItem(obj, name, value); + Py_DECREF((PyObject *)name); + Py_DECREF((PyObject *)value); + return 1; } -int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) -{ - PyList_Append(obj, value); - Py_DECREF( (PyObject *) value); - return 1; +int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { + PyList_Append(obj, value); + Py_DECREF((PyObject *)value); + return 1; } -JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) -{ - return PyUnicode_FromWideChar (start, (end - start)); +JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) { + return PyUnicode_FromWideChar(start, (end - start)); } -JSOBJ Object_newTrue(void *prv) -{ - Py_RETURN_TRUE; -} +JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; } -JSOBJ Object_newFalse(void *prv) -{ - Py_RETURN_FALSE; -} +JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } -JSOBJ Object_newNull(void *prv) -{ - Py_RETURN_NONE; -} +JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } -JSOBJ Object_newObject(void *prv, void* decoder) -{ - return PyDict_New(); -} +JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } -JSOBJ Object_endObject(void *prv, JSOBJ obj) -{ - return obj; -} +JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } -JSOBJ Object_newArray(void *prv, void* decoder) -{ - return PyList_New(0); -} +JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); } -JSOBJ Object_endArray(void *prv, JSOBJ obj) -{ - return obj; -} +JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; } -JSOBJ Object_newInteger(void *prv, JSINT32 value) -{ - return PyInt_FromLong( (long) value); +JSOBJ Object_newInteger(void *prv, JSINT32 value) { + return PyInt_FromLong((long)value); } -JSOBJ Object_newLong(void *prv, JSINT64 value) -{ - return PyLong_FromLongLong (value); +JSOBJ Object_newLong(void *prv, JSINT64 value) { + return PyLong_FromLongLong(value); } -JSOBJ Object_newDouble(void *prv, double value) -{ - return PyFloat_FromDouble(value); +JSOBJ Object_newDouble(void *prv, double value) { + return PyFloat_FromDouble(value); } -static void Object_releaseObject(void *prv, JSOBJ obj, void* _decoder) -{ - PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; - if (obj != decoder->npyarr_addr) - { - Py_XDECREF( ((PyObject *)obj)); - } +static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) { + PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; + if (obj != decoder->npyarr_addr) { + Py_XDECREF(((PyObject *)obj)); + } } -static char *g_kwlist[] = {"obj", 
"precise_float", "numpy", "labelled", "dtype", NULL}; - -PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) -{ - PyObject *ret; - PyObject *sarg; - PyObject *arg; - PyObject *opreciseFloat = NULL; - JSONObjectDecoder *decoder; - PyObjectDecoder pyDecoder; - PyArray_Descr *dtype = NULL; - int numpy = 0, labelled = 0; - - JSONObjectDecoder dec = - { - Object_newString, - Object_objectAddKey, - Object_arrayAddItem, - Object_newTrue, - Object_newFalse, - Object_newNull, - Object_newObject, - Object_endObject, - Object_newArray, - Object_endArray, - Object_newInteger, - Object_newLong, - Object_newDouble, - Object_releaseObject, - PyObject_Malloc, - PyObject_Free, - PyObject_Realloc - }; - - dec.preciseFloat = 0; - dec.prv = NULL; - - pyDecoder.dec = dec; - pyDecoder.curdim = 0; - pyDecoder.npyarr = NULL; - pyDecoder.npyarr_addr = NULL; - - decoder = (JSONObjectDecoder*) &pyDecoder; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg, &opreciseFloat, &numpy, &labelled, PyArray_DescrConverter2, &dtype)) - { - Npy_releaseContext(pyDecoder.npyarr); - return NULL; - } - - if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) - { - decoder->preciseFloat = 1; - } - - if (PyString_Check(arg)) - { - sarg = arg; - } - else - if (PyUnicode_Check(arg)) - { - sarg = PyUnicode_AsUTF8String(arg); - if (sarg == NULL) - { - //Exception raised above us by codec according to docs - return NULL; - } - } - else - { - PyErr_Format(PyExc_TypeError, "Expected String or Unicode"); - return NULL; - } - - decoder->errorStr = NULL; - decoder->errorOffset = NULL; +static char *g_kwlist[] = {"obj", "precise_float", "numpy", + "labelled", "dtype", NULL}; + +PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { + PyObject *ret; + PyObject *sarg; + PyObject *arg; + PyObject *opreciseFloat = NULL; + JSONObjectDecoder *decoder; + PyObjectDecoder pyDecoder; + PyArray_Descr *dtype = NULL; + int numpy = 0, labelled = 0; + + JSONObjectDecoder dec = { + Object_newString, Object_objectAddKey, Object_arrayAddItem, + Object_newTrue, Object_newFalse, Object_newNull, + Object_newObject, Object_endObject, Object_newArray, + Object_endArray, Object_newInteger, Object_newLong, + Object_newDouble, Object_releaseObject, PyObject_Malloc, + PyObject_Free, PyObject_Realloc}; + + dec.preciseFloat = 0; + dec.prv = NULL; + + pyDecoder.dec = dec; + pyDecoder.curdim = 0; + pyDecoder.npyarr = NULL; + pyDecoder.npyarr_addr = NULL; + + decoder = (JSONObjectDecoder *)&pyDecoder; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg, + &opreciseFloat, &numpy, &labelled, + PyArray_DescrConverter2, &dtype)) { + Npy_releaseContext(pyDecoder.npyarr); + return NULL; + } - if (numpy) - { - pyDecoder.dtype = dtype; - decoder->newArray = Object_npyNewArray; - decoder->endArray = Object_npyEndArray; - decoder->arrayAddItem = Object_npyArrayAddItem; + if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) { + decoder->preciseFloat = 1; + } - if (labelled) - { - decoder->newObject = Object_npyNewObject; - decoder->endObject = Object_npyEndObject; - decoder->objectAddKey = Object_npyObjectAddKey; + if (PyString_Check(arg)) { + sarg = arg; + } else if (PyUnicode_Check(arg)) { + sarg = PyUnicode_AsUTF8String(arg); + if (sarg == NULL) { + // Exception raised above us by codec according to docs + return NULL; + } + } else { + PyErr_Format(PyExc_TypeError, "Expected String or Unicode"); + return NULL; } - } - ret = JSON_DecodeObject(decoder, PyString_AS_STRING(sarg), 
PyString_GET_SIZE(sarg)); + decoder->errorStr = NULL; + decoder->errorOffset = NULL; - if (sarg != arg) - { - Py_DECREF(sarg); - } + if (numpy) { + pyDecoder.dtype = dtype; + decoder->newArray = Object_npyNewArray; + decoder->endArray = Object_npyEndArray; + decoder->arrayAddItem = Object_npyArrayAddItem; - if (PyErr_Occurred()) - { - if (ret) - { - Py_DECREF( (PyObject *) ret); + if (labelled) { + decoder->newObject = Object_npyNewObject; + decoder->endObject = Object_npyEndObject; + decoder->objectAddKey = Object_npyObjectAddKey; + } } - Npy_releaseContext(pyDecoder.npyarr); - return NULL; - } - if (decoder->errorStr) - { - /* - FIXME: It's possible to give a much nicer error message here with actual failing element in input etc*/ + ret = JSON_DecodeObject(decoder, PyString_AS_STRING(sarg), + PyString_GET_SIZE(sarg)); - PyErr_Format (PyExc_ValueError, "%s", decoder->errorStr); + if (sarg != arg) { + Py_DECREF(sarg); + } - if (ret) - { - Py_DECREF( (PyObject *) ret); + if (PyErr_Occurred()) { + if (ret) { + Py_DECREF((PyObject *)ret); + } + Npy_releaseContext(pyDecoder.npyarr); + return NULL; } - Npy_releaseContext(pyDecoder.npyarr); - return NULL; - } + if (decoder->errorStr) { + /* + FIXME: It's possible to give a much nicer error message here with actual + failing element in input etc*/ + + PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr); - return ret; + if (ret) { + Py_DECREF((PyObject *)ret); + } + Npy_releaseContext(pyDecoder.npyarr); + + return NULL; + } + + return ret; } -PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs) -{ - PyObject *read; - PyObject *string; - PyObject *result; - PyObject *file = NULL; - PyObject *argtuple; +PyObject *JSONFileToObj(PyObject *self, PyObject *args, PyObject *kwargs) { + PyObject *read; + PyObject *string; + PyObject *result; + PyObject *file = NULL; + PyObject *argtuple; - if (!PyArg_ParseTuple (args, "O", &file)) - { - return NULL; - } + if (!PyArg_ParseTuple(args, "O", &file)) { + return NULL; + } - if (!PyObject_HasAttrString (file, "read")) - { - PyErr_Format (PyExc_TypeError, "expected file"); - return NULL; - } + if (!PyObject_HasAttrString(file, "read")) { + PyErr_Format(PyExc_TypeError, "expected file"); + return NULL; + } - read = PyObject_GetAttrString (file, "read"); + read = PyObject_GetAttrString(file, "read"); - if (!PyCallable_Check (read)) { - Py_XDECREF(read); - PyErr_Format (PyExc_TypeError, "expected file"); - return NULL; - } + if (!PyCallable_Check(read)) { + Py_XDECREF(read); + PyErr_Format(PyExc_TypeError, "expected file"); + return NULL; + } - string = PyObject_CallObject (read, NULL); - Py_XDECREF(read); + string = PyObject_CallObject(read, NULL); + Py_XDECREF(read); - if (string == NULL) - { - return NULL; - } + if (string == NULL) { + return NULL; + } - argtuple = PyTuple_Pack(1, string); + argtuple = PyTuple_Pack(1, string); - result = JSONToObj (self, argtuple, kwargs); + result = JSONToObj(self, argtuple, kwargs); - Py_XDECREF(argtuple); - Py_XDECREF(string); + Py_XDECREF(argtuple); + Py_XDECREF(string); - if (result == NULL) { - return NULL; - } + if (result == NULL) { + return NULL; + } - return result; + return result; } diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 75de63acbd7d6..42c0b62a57511 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -36,105 +36,104 @@ Numeric decoder derived from from TCL library */ #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY -#include "py_defines.h" -#include 
-#include -#include -#include -#include -#include -#include -#include -#include -#include - -static PyObject* type_decimal; +// "py_defines.h" needs to be included first to +// avoid compilation errors, but it does violate +// styleguide checks with regards to include order. +#include "py_defines.h" // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) + +static PyObject *type_decimal; #define NPY_JSON_BUFSIZE 32768 -static PyTypeObject* cls_dataframe; -static PyTypeObject* cls_series; -static PyTypeObject* cls_index; -static PyTypeObject* cls_nat; +static PyTypeObject *cls_dataframe; +static PyTypeObject *cls_series; +static PyTypeObject *cls_index; +static PyTypeObject *cls_nat; -typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, void *outValue, size_t *_outLen); +typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, + void *outValue, size_t *_outLen); #if (PY_VERSION_HEX < 0x02050000) typedef ssize_t Py_ssize_t; #endif +typedef struct __NpyArrContext { + PyObject *array; + char *dataptr; + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) + npy_intp dim; + npy_intp stride; + npy_intp ndim; + npy_intp index[NPY_MAXDIMS]; + int type_num; + PyArray_GetItemFunc *getitem; -typedef struct __NpyArrContext -{ - PyObject *array; - char* dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) - npy_intp dim; - npy_intp stride; - npy_intp ndim; - npy_intp index[NPY_MAXDIMS]; - int type_num; - PyArray_GetItemFunc* getitem; - - char** rowLabels; - char** columnLabels; + char **rowLabels; + char **columnLabels; } NpyArrContext; -typedef struct __PdBlockContext -{ - int colIdx; - int ncols; - int transpose; +typedef struct __PdBlockContext { + int colIdx; + int ncols; + int transpose; - int* cindices; // frame column -> block column map - NpyArrContext** npyCtxts; // NpyArrContext for each column + int *cindices; // frame column -> block column map + NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; -typedef struct __TypeContext -{ - JSPFN_ITERBEGIN iterBegin; - JSPFN_ITEREND iterEnd; - JSPFN_ITERNEXT iterNext; - JSPFN_ITERGETNAME iterGetName; - JSPFN_ITERGETVALUE iterGetValue; - PFN_PyTypeToJSON PyTypeToJSON; - PyObject *newObj; - PyObject *dictObj; - Py_ssize_t index; - Py_ssize_t size; - PyObject *itemValue; - PyObject *itemName; - PyObject *attrList; - PyObject *iterator; - - double doubleValue; - JSINT64 longValue; - - char *cStr; - NpyArrContext *npyarr; - PdBlockContext *pdblock; - int transpose; - char** rowLabels; - char** columnLabels; - npy_intp rowLabelsLen; - npy_intp columnLabelsLen; +typedef struct __TypeContext { + JSPFN_ITERBEGIN iterBegin; + JSPFN_ITEREND iterEnd; + JSPFN_ITERNEXT iterNext; + JSPFN_ITERGETNAME iterGetName; + JSPFN_ITERGETVALUE iterGetValue; + PFN_PyTypeToJSON PyTypeToJSON; + PyObject *newObj; + PyObject *dictObj; + Py_ssize_t index; + Py_ssize_t size; + PyObject *itemValue; + PyObject *itemName; + PyObject *attrList; + PyObject 
*iterator; + + double doubleValue; + JSINT64 longValue; + + char *cStr; + NpyArrContext *npyarr; + PdBlockContext *pdblock; + int transpose; + char **rowLabels; + char **columnLabels; + npy_intp rowLabelsLen; + npy_intp columnLabelsLen; } TypeContext; -typedef struct __PyObjectEncoder -{ +typedef struct __PyObjectEncoder { JSONObjectEncoder enc; // pass through the NpyArrContext when encoding multi-dimensional arrays - NpyArrContext* npyCtxtPassthru; + NpyArrContext *npyCtxtPassthru; // pass through the PdBlockContext when encoding blocks - PdBlockContext* blkCtxtPassthru; + PdBlockContext *blkCtxtPassthru; // pass-through to encode numpy data directly int npyType; - void* npyValue; + void *npyValue; TypeContext basicTypeContext; int datetimeIso; @@ -149,16 +148,8 @@ typedef struct __PyObjectEncoder #define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) -enum PANDAS_FORMAT -{ - SPLIT, - RECORDS, - INDEX, - COLUMNS, - VALUES -}; - -//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; + #define PRINTMARK() int PdBlock_iterNext(JSOBJ, JSONTypeContext *); @@ -166,709 +157,624 @@ int PdBlock_iterNext(JSOBJ, JSONTypeContext *); // import_array() compat #if (PY_VERSION_HEX >= 0x03000000) void *initObjToJSON(void) - #else void initObjToJSON(void) #endif { - PyObject *mod_pandas; - PyObject *mod_tslib; - PyObject* mod_decimal = PyImport_ImportModule("decimal"); - type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); - Py_INCREF(type_decimal); - Py_DECREF(mod_decimal); - - PyDateTime_IMPORT; - - mod_pandas = PyImport_ImportModule("pandas"); - if (mod_pandas) - { - cls_dataframe = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "DataFrame"); - cls_index = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "Index"); - cls_series = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "Series"); - Py_DECREF(mod_pandas); - } - - mod_tslib = PyImport_ImportModule("pandas.tslib"); - if (mod_tslib) - { - cls_nat = (PyTypeObject*) PyObject_GetAttrString(mod_tslib, "NaTType"); - Py_DECREF(mod_tslib); - } - - /* Initialise numpy API and use 2/3 compatible return */ - import_array(); - return NUMPY_IMPORT_ARRAY_RETVAL; -} - -static TypeContext* createTypeContext(void) -{ - TypeContext *pc; - - pc = PyObject_Malloc(sizeof(TypeContext)); - if (!pc) - { - PyErr_NoMemory(); - return NULL; - } - pc->newObj = NULL; - pc->dictObj = NULL; - pc->itemValue = NULL; - pc->itemName = NULL; - pc->attrList = NULL; - pc->index = 0; - pc->size = 0; - pc->longValue = 0; - pc->doubleValue = 0.0; - pc->cStr = NULL; - pc->npyarr = NULL; - pc->pdblock = NULL; - pc->rowLabels = NULL; - pc->columnLabels = NULL; - pc->transpose = 0; - pc->rowLabelsLen = 0; - pc->columnLabelsLen = 0; - - return pc; -} - -static PyObject* get_values(PyObject *obj) -{ - PyObject *values = PyObject_GetAttrString(obj, "values"); - PRINTMARK(); + PyObject *mod_pandas; + PyObject *mod_tslib; + PyObject *mod_decimal = PyImport_ImportModule("decimal"); + type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); + Py_INCREF(type_decimal); + Py_DECREF(mod_decimal); - if (values && !PyArray_CheckExact(values)) - { - if (PyObject_HasAttrString(values, "values")) - { - PyObject *subvals = get_values(values); - PyErr_Clear(); - PRINTMARK(); - // subvals are sometimes missing a dimension - if (subvals) - { - PyArrayObject *reshape = (PyArrayObject*) subvals; - PyObject *shape = PyObject_GetAttrString(obj, "shape"); - PyArray_Dims dims; - PRINTMARK(); + 
PyDateTime_IMPORT; - if (!shape || !PyArray_IntpConverter(shape, &dims)) - { - subvals = NULL; - } - else - { - subvals = PyArray_Newshape(reshape, &dims, NPY_ANYORDER); - PyDimMem_FREE(dims.ptr); - } - Py_DECREF(reshape); - Py_XDECREF(shape); - } - Py_DECREF(values); - values = subvals; - } - else - { - PRINTMARK(); - Py_DECREF(values); - values = NULL; + mod_pandas = PyImport_ImportModule("pandas"); + if (mod_pandas) { + cls_dataframe = + (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "DataFrame"); + cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); + cls_series = + (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); + Py_DECREF(mod_pandas); } - } - if (!values && PyObject_HasAttrString(obj, "get_values")) - { - PRINTMARK(); - values = PyObject_CallMethod(obj, "get_values", NULL); - if (values && !PyArray_CheckExact(values)) - { - PRINTMARK(); - Py_DECREF(values); - values = NULL; + mod_tslib = PyImport_ImportModule("pandas.tslib"); + if (mod_tslib) { + cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_tslib, "NaTType"); + Py_DECREF(mod_tslib); } - } - if (!values) - { - PyObject *typeRepr = PyObject_Repr((PyObject*) Py_TYPE(obj)); - PyObject *repr; + /* Initialise numpy API and use 2/3 compatible return */ + import_array(); + return NUMPY_IMPORT_ARRAY_RETVAL; +} + +static TypeContext *createTypeContext(void) { + TypeContext *pc; + + pc = PyObject_Malloc(sizeof(TypeContext)); + if (!pc) { + PyErr_NoMemory(); + return NULL; + } + pc->newObj = NULL; + pc->dictObj = NULL; + pc->itemValue = NULL; + pc->itemName = NULL; + pc->attrList = NULL; + pc->index = 0; + pc->size = 0; + pc->longValue = 0; + pc->doubleValue = 0.0; + pc->cStr = NULL; + pc->npyarr = NULL; + pc->pdblock = NULL; + pc->rowLabels = NULL; + pc->columnLabels = NULL; + pc->transpose = 0; + pc->rowLabelsLen = 0; + pc->columnLabelsLen = 0; + + return pc; +} + +static PyObject *get_values(PyObject *obj) { + PyObject *values = PyObject_GetAttrString(obj, "values"); PRINTMARK(); - if (PyObject_HasAttrString(obj, "dtype")) - { - PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); - repr = PyObject_Repr(dtype); - Py_DECREF(dtype); + + if (values && !PyArray_CheckExact(values)) { + if (PyObject_HasAttrString(values, "values")) { + PyObject *subvals = get_values(values); + PyErr_Clear(); + PRINTMARK(); + // subvals are sometimes missing a dimension + if (subvals) { + PyArrayObject *reshape = (PyArrayObject *)subvals; + PyObject *shape = PyObject_GetAttrString(obj, "shape"); + PyArray_Dims dims; + PRINTMARK(); + + if (!shape || !PyArray_IntpConverter(shape, &dims)) { + subvals = NULL; + } else { + subvals = PyArray_Newshape(reshape, &dims, NPY_ANYORDER); + PyDimMem_FREE(dims.ptr); + } + Py_DECREF(reshape); + Py_XDECREF(shape); + } + Py_DECREF(values); + values = subvals; + } else { + PRINTMARK(); + Py_DECREF(values); + values = NULL; + } } - else - { - repr = PyString_FromString(""); + + if (!values && PyObject_HasAttrString(obj, "get_values")) { + PRINTMARK(); + values = PyObject_CallMethod(obj, "get_values", NULL); + if (values && !PyArray_CheckExact(values)) { + PRINTMARK(); + Py_DECREF(values); + values = NULL; + } } - PyErr_Format(PyExc_ValueError, - "%s or %s are not JSON serializable yet", - PyString_AS_STRING(repr), - PyString_AS_STRING(typeRepr)); - Py_DECREF(repr); - Py_DECREF(typeRepr); + if (!values) { + PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); + PyObject *repr; + PRINTMARK(); + if (PyObject_HasAttrString(obj, "dtype")) { + PyObject *dtype = 
PyObject_GetAttrString(obj, "dtype"); + repr = PyObject_Repr(dtype); + Py_DECREF(dtype); + } else { + repr = PyString_FromString(""); + } - return NULL; - } + PyErr_Format(PyExc_ValueError, "%s or %s are not JSON serializable yet", + PyString_AS_STRING(repr), PyString_AS_STRING(typeRepr)); + Py_DECREF(repr); + Py_DECREF(typeRepr); - return values; + return NULL; + } + + return values; } -static PyObject* get_sub_attr(PyObject *obj, char *attr, char *subAttr) -{ - PyObject *tmp = PyObject_GetAttrString(obj, attr); - PyObject *ret; +static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { + PyObject *tmp = PyObject_GetAttrString(obj, attr); + PyObject *ret; - if (tmp == 0) - { - return 0; - } - ret = PyObject_GetAttrString(tmp, subAttr); - Py_DECREF(tmp); + if (tmp == 0) { + return 0; + } + ret = PyObject_GetAttrString(tmp, subAttr); + Py_DECREF(tmp); - return ret; + return ret; } -static int is_simple_frame(PyObject *obj) -{ - PyObject *check = get_sub_attr(obj, "_data", "is_mixed_type"); - int ret = (check == Py_False); +static int is_simple_frame(PyObject *obj) { + PyObject *check = get_sub_attr(obj, "_data", "is_mixed_type"); + int ret = (check == Py_False); - if (!check) - { - return 0; - } + if (!check) { + return 0; + } - Py_DECREF(check); - return ret; + Py_DECREF(check); + return ret; } -static Py_ssize_t get_attr_length(PyObject *obj, char *attr) -{ - PyObject *tmp = PyObject_GetAttrString(obj, attr); - Py_ssize_t ret; +static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { + PyObject *tmp = PyObject_GetAttrString(obj, attr); + Py_ssize_t ret; - if (tmp == 0) - { - return 0; - } - ret = PyObject_Length(tmp); - Py_DECREF(tmp); + if (tmp == 0) { + return 0; + } + ret = PyObject_Length(tmp); + Py_DECREF(tmp); - if (ret == -1) - { - return 0; - } + if (ret == -1) { + return 0; + } - return ret; + return ret; } -static PyObject* get_item(PyObject *obj, Py_ssize_t i) -{ - PyObject *tmp = PyInt_FromSsize_t(i); - PyObject *ret; +static PyObject *get_item(PyObject *obj, Py_ssize_t i) { + PyObject *tmp = PyInt_FromSsize_t(i); + PyObject *ret; - if (tmp == 0) - { - return 0; - } - ret = PyObject_GetItem(obj, tmp); - Py_DECREF(tmp); + if (tmp == 0) { + return 0; + } + ret = PyObject_GetItem(obj, tmp); + Py_DECREF(tmp); - return ret; + return ret; } -static void *CDouble(JSOBJ obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PRINTMARK(); - *((double *) outValue) = GET_TC(tc)->doubleValue; - return NULL; +static void *CDouble(JSOBJ obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PRINTMARK(); + *((double *)outValue) = GET_TC(tc)->doubleValue; + return NULL; } -static void *CLong(JSOBJ obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PRINTMARK(); - *((JSINT64 *) outValue) = GET_TC(tc)->longValue; - return NULL; +static void *CLong(JSOBJ obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PRINTMARK(); + *((JSINT64 *)outValue) = GET_TC(tc)->longValue; + return NULL; } #ifdef _LP64 -static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PyObject *obj = (PyObject *) _obj; - *((JSINT64 *) outValue) = PyInt_AS_LONG (obj); - return NULL; +static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PyObject *obj = (PyObject *)_obj; + *((JSINT64 *)outValue) = PyInt_AS_LONG(obj); + return NULL; } #else -static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PyObject *obj = (PyObject *) _obj; - *((JSINT32 
*) outValue) = PyInt_AS_LONG (obj); - return NULL; +static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PyObject *obj = (PyObject *)_obj; + *((JSINT32 *)outValue) = PyInt_AS_LONG(obj); + return NULL; } #endif -static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - *((JSINT64 *) outValue) = GET_TC(tc)->longValue; - return NULL; +static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + *((JSINT64 *)outValue) = GET_TC(tc)->longValue; + return NULL; } -static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PyObject *obj = (PyObject *) _obj; - PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); - return NULL; +static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PyObject *obj = (PyObject *)_obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); + return NULL; } -static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PyObject *obj = (PyObject *) _obj; - *((double *) outValue) = PyFloat_AsDouble (obj); - return NULL; +static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PyObject *obj = (PyObject *)_obj; + *((double *)outValue) = PyFloat_AsDouble(obj); + return NULL; } -static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PyObject *obj = (PyObject *) _obj; - *_outLen = PyString_GET_SIZE(obj); - return PyString_AS_STRING(obj); +static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PyObject *obj = (PyObject *)_obj; + *_outLen = PyString_GET_SIZE(obj); + return PyString_AS_STRING(obj); } -static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PyObject *obj = (PyObject *) _obj; - PyObject *newObj = PyUnicode_EncodeUTF8 (PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), NULL); +static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PyObject *obj = (PyObject *)_obj; + PyObject *newObj = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj), + PyUnicode_GET_SIZE(obj), NULL); - GET_TC(tc)->newObj = newObj; + GET_TC(tc)->newObj = newObj; - *_outLen = PyString_GET_SIZE(newObj); - return PyString_AS_STRING(newObj); + *_outLen = PyString_GET_SIZE(newObj); + return PyString_AS_STRING(newObj); } -static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - PANDAS_DATETIMEUNIT base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; +static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, + JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + PANDAS_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (((PyObjectEncoder*) tc->encoder)->datetimeIso) - { - PRINTMARK(); - *_outLen = (size_t) get_datetime_iso_8601_strlen(0, base); - GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen)); - if (!GET_TC(tc)->cStr) - { - PyErr_NoMemory(); - ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; - return NULL; - } + if (((PyObjectEncoder *)tc->encoder)->datetimeIso) { + PRINTMARK(); + *_outLen = (size_t)get_datetime_iso_8601_strlen(0, base); + GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen)); + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + ((JSONObjectEncoder 
*)tc->encoder)->errorMsg = ""; + return NULL; + } - if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 0, base, -1, NPY_UNSAFE_CASTING)) - { - PRINTMARK(); - *_outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; + if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 0, base, + -1, NPY_UNSAFE_CASTING)) { + PRINTMARK(); + *_outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; + } else { + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + PyObject_Free(GET_TC(tc)->cStr); + return NULL; + } + } else { + PRINTMARK(); + *((JSINT64 *)outValue) = pandas_datetimestruct_to_datetime(base, dts); + return NULL; } - else - { - PRINTMARK(); - PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); - ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; - PyObject_Free(GET_TC(tc)->cStr); - return NULL; - } - } - else - { - PRINTMARK(); - *((JSINT64*)outValue) = pandas_datetimestruct_to_datetime(base, dts); - return NULL; - } } -static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - pandas_datetimestruct dts; - PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *) _obj; - PRINTMARK(); +static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, + void *outValue, size_t *_outLen) { + pandas_datetimestruct dts; + PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *)_obj; + PRINTMARK(); - pandas_datetime_to_datetimestruct(obj->obval, (PANDAS_DATETIMEUNIT)obj->obmeta.base, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + pandas_datetime_to_datetimestruct( + obj->obval, (PANDAS_DATETIMEUNIT)obj->obmeta.base, &dts); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } -static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - pandas_datetimestruct dts; - PyObject *obj = (PyObject *) _obj; - - PRINTMARK(); +static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *_outLen) { + pandas_datetimestruct dts; + PyObject *obj = (PyObject *)_obj; - if (!convert_pydatetime_to_datetimestruct(obj, &dts, NULL, 1)) - { PRINTMARK(); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); - } - else - { - if (!PyErr_Occurred()) - { - PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); + + if (!convert_pydatetime_to_datetimestruct(obj, &dts, NULL, 1)) { + PRINTMARK(); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + } else { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + } + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; } - ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; - return NULL; - } } -static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) -{ - pandas_datetimestruct dts; - PRINTMARK(); +static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, + void *outValue, size_t *_outLen) { + pandas_datetimestruct dts; + PRINTMARK(); - pandas_datetime_to_datetimestruct( - (npy_datetime) GET_TC(tc)->longValue, - PANDAS_FR_ns, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue, + PANDAS_FR_ns, &dts); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } -static void *PyTimeToJSON(JSOBJ _obj, 
JSONTypeContext *tc, void *outValue, size_t *outLen) -{ - PyObject *obj = (PyObject *) _obj; - PyObject *str; - PyObject *tmp; +static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, + size_t *outLen) { + PyObject *obj = (PyObject *)_obj; + PyObject *str; + PyObject *tmp; - str = PyObject_CallMethod(obj, "isoformat", NULL); - if (str == NULL) { - PRINTMARK(); - *outLen = 0; - if (!PyErr_Occurred()) - { - PyErr_SetString(PyExc_ValueError, "Failed to convert time"); + str = PyObject_CallMethod(obj, "isoformat", NULL); + if (str == NULL) { + PRINTMARK(); + *outLen = 0; + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, "Failed to convert time"); + } + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + if (PyUnicode_Check(str)) { + tmp = str; + str = PyUnicode_AsUTF8String(str); + Py_DECREF(tmp); } - ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; - return NULL; - } - if (PyUnicode_Check(str)) - { - tmp = str; - str = PyUnicode_AsUTF8String(str); - Py_DECREF(tmp); - } - GET_TC(tc)->newObj = str; + GET_TC(tc)->newObj = str; - *outLen = PyString_GET_SIZE(str); - outValue = (void *) PyString_AS_STRING (str); - return outValue; + *outLen = PyString_GET_SIZE(str); + outValue = (void *)PyString_AS_STRING(str); + return outValue; } -static int NpyTypeToJSONType(PyObject* obj, JSONTypeContext* tc, int npyType, void* value) -{ - PyArray_VectorUnaryFunc* castfunc; - npy_double doubleVal; - npy_int64 longVal; +static int NpyTypeToJSONType(PyObject *obj, JSONTypeContext *tc, int npyType, + void *value) { + PyArray_VectorUnaryFunc *castfunc; + npy_double doubleVal; + npy_int64 longVal; - if (PyTypeNum_ISFLOAT(npyType)) - { - PRINTMARK(); - castfunc = PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_DOUBLE); - if (!castfunc) - { - PyErr_Format ( - PyExc_ValueError, - "Cannot cast numpy dtype %d to double", - npyType); + if (PyTypeNum_ISFLOAT(npyType)) { + PRINTMARK(); + castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_DOUBLE); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to double", npyType); + } + castfunc(value, &doubleVal, 1, NULL, NULL); + if (npy_isnan(doubleVal) || npy_isinf(doubleVal)) { + PRINTMARK(); + return JT_NULL; + } + GET_TC(tc)->doubleValue = (double)doubleVal; + GET_TC(tc)->PyTypeToJSON = CDouble; + return JT_DOUBLE; } - castfunc(value, &doubleVal, 1, NULL, NULL); - if (npy_isnan(doubleVal) || npy_isinf(doubleVal)) - { - PRINTMARK(); - return JT_NULL; + + if (PyTypeNum_ISDATETIME(npyType)) { + PRINTMARK(); + castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", + npyType); + } + castfunc(value, &longVal, 1, NULL, NULL); + if (longVal == get_nat()) { + PRINTMARK(); + return JT_NULL; + } + GET_TC(tc)->longValue = (JSINT64)longVal; + GET_TC(tc)->PyTypeToJSON = NpyDatetime64ToJSON; + return ((PyObjectEncoder *)tc->encoder)->datetimeIso ? 
JT_UTF8 + : JT_LONG; } - GET_TC(tc)->doubleValue = (double) doubleVal; - GET_TC(tc)->PyTypeToJSON = CDouble; - return JT_DOUBLE; - } - if (PyTypeNum_ISDATETIME(npyType)) - { - PRINTMARK(); - castfunc = PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64); - if (!castfunc) - { - PyErr_Format ( - PyExc_ValueError, - "Cannot cast numpy dtype %d to long", - npyType); + if (PyTypeNum_ISINTEGER(npyType)) { + PRINTMARK(); + castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", + npyType); + } + castfunc(value, &longVal, 1, NULL, NULL); + GET_TC(tc)->longValue = (JSINT64)longVal; + GET_TC(tc)->PyTypeToJSON = CLong; + return JT_LONG; } - castfunc(value, &longVal, 1, NULL, NULL); - if (longVal == get_nat()) - { - PRINTMARK(); - return JT_NULL; + + if (PyTypeNum_ISBOOL(npyType)) { + PRINTMARK(); + return *((npy_bool *)value) == NPY_TRUE ? JT_TRUE : JT_FALSE; } - GET_TC(tc)->longValue = (JSINT64) longVal; - GET_TC(tc)->PyTypeToJSON = NpyDatetime64ToJSON; - return ((PyObjectEncoder *) tc->encoder)->datetimeIso ? JT_UTF8 : JT_LONG; - } - if (PyTypeNum_ISINTEGER(npyType)) - { - PRINTMARK(); - castfunc = PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64); - if (!castfunc) - { - PyErr_Format ( - PyExc_ValueError, - "Cannot cast numpy dtype %d to long", - npyType); - } - castfunc(value, &longVal, 1, NULL, NULL); - GET_TC(tc)->longValue = (JSINT64) longVal; - GET_TC(tc)->PyTypeToJSON = CLong; - return JT_LONG; - } - - if (PyTypeNum_ISBOOL(npyType)) - { PRINTMARK(); - return *((npy_bool *) value) == NPY_TRUE ? JT_TRUE : JT_FALSE; - } - - PRINTMARK(); - return JT_INVALID; + return JT_INVALID; } - //============================================================================= // Numpy array iteration functions //============================================================================= -static void NpyArr_freeItemValue(JSOBJ _obj, JSONTypeContext *tc) -{ - if (GET_TC(tc)->npyarr && GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) - { - PRINTMARK(); - Py_XDECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } +static void NpyArr_freeItemValue(JSOBJ _obj, JSONTypeContext *tc) { + if (GET_TC(tc)->npyarr && + GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { + PRINTMARK(); + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } } -int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) -{ - return 0; -} +int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) { return 0; } -void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) -{ - PyArrayObject *obj; - NpyArrContext *npyarr; - - if (GET_TC(tc)->newObj) - { - obj = (PyArrayObject *) GET_TC(tc)->newObj; - } - else - { - obj = (PyArrayObject *) _obj; - } - - if (PyArray_SIZE(obj) < 0) - { - PRINTMARK(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - } - else - { - PRINTMARK(); - npyarr = PyObject_Malloc(sizeof(NpyArrContext)); - GET_TC(tc)->npyarr = npyarr; +void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { + PyArrayObject *obj; + NpyArrContext *npyarr; - if (!npyarr) - { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; + if (GET_TC(tc)->newObj) { + obj = (PyArrayObject *)GET_TC(tc)->newObj; + } else { + obj = (PyArrayObject *)_obj; } - npyarr->array = (PyObject*) obj; - npyarr->getitem = (PyArray_GetItemFunc*) PyArray_DESCR(obj)->f->getitem; - npyarr->dataptr = PyArray_DATA(obj); - npyarr->ndim = PyArray_NDIM(obj) - 1; - npyarr->curdim = 0; - 
npyarr->type_num = PyArray_DESCR(obj)->type_num; + if (PyArray_SIZE(obj) < 0) { + PRINTMARK(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + } else { + PRINTMARK(); + npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + GET_TC(tc)->npyarr = npyarr; - if (GET_TC(tc)->transpose) - { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; - } - else - { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; - } + if (!npyarr) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + npyarr->array = (PyObject *)obj; + npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; + npyarr->dataptr = PyArray_DATA(obj); + npyarr->ndim = PyArray_NDIM(obj) - 1; + npyarr->curdim = 0; + npyarr->type_num = PyArray_DESCR(obj)->type_num; + + if (GET_TC(tc)->transpose) { + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; + } else { + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; + } - npyarr->columnLabels = GET_TC(tc)->columnLabels; - npyarr->rowLabels = GET_TC(tc)->rowLabels; - } + npyarr->columnLabels = GET_TC(tc)->columnLabels; + npyarr->rowLabels = GET_TC(tc)->rowLabels; + } } -void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); +void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + PRINTMARK(); - if (npyarr) - { - NpyArr_freeItemValue(obj, tc); - PyObject_Free(npyarr); - } + if (npyarr) { + NpyArr_freeItemValue(obj, tc); + PyObject_Free(npyarr); + } } -void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - PRINTMARK(); -} +void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } -void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - NpyArrContext* npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); - // finished this dimension, reset the data pointer - npyarr->curdim--; - npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; - npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); - npyarr->dataptr += npyarr->stride; +void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + PRINTMARK(); + // finished this dimension, reset the data pointer + npyarr->curdim--; + npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; + npyarr->stridedim -= npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->dataptr += npyarr->stride; - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); } -int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) -{ - NpyArrContext* npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); +int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + PRINTMARK(); - if (PyErr_Occurred()) - { - return 0; - } + if (PyErr_Occurred()) { + return 0; + } - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) - { 
- PRINTMARK(); - return 0; - } + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { + PRINTMARK(); + return 0; + } - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); #if NPY_API_VERSION < 0x00000007 - if(PyArray_ISDATETIME(npyarr->array)) - { - PRINTMARK(); - GET_TC(tc)->itemValue = PyArray_ToScalar(npyarr->dataptr, npyarr->array); - } - else - if (PyArray_ISNUMBER(npyarr->array)) + if (PyArray_ISDATETIME(npyarr->array)) { + PRINTMARK(); + GET_TC(tc) + ->itemValue = PyArray_ToScalar(npyarr->dataptr, npyarr->array); + } else if (PyArray_ISNUMBER(npyarr->array)) // NOLINT #else - if (PyArray_ISNUMBER(npyarr->array) || PyArray_ISDATETIME(npyarr->array)) + if (PyArray_ISNUMBER(npyarr->array) || PyArray_ISDATETIME(npyarr->array)) // NOLINT #endif - { - PRINTMARK(); - GET_TC(tc)->itemValue = obj; - Py_INCREF(obj); - ((PyObjectEncoder*) tc->encoder)->npyType = PyArray_TYPE(npyarr->array); - ((PyObjectEncoder*) tc->encoder)->npyValue = npyarr->dataptr; - ((PyObjectEncoder*) tc->encoder)->npyCtxtPassthru = npyarr; - } - else - { - PRINTMARK(); - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); - } + { + PRINTMARK(); + GET_TC(tc)->itemValue = obj; + Py_INCREF(obj); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; + } else { + PRINTMARK(); + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + } - npyarr->dataptr += npyarr->stride; - npyarr->index[npyarr->stridedim]++; - return 1; + npyarr->dataptr += npyarr->stride; + npyarr->index[npyarr->stridedim]++; + return 1; } -int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) -{ - NpyArrContext* npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); - - if (PyErr_Occurred()) - { +int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; PRINTMARK(); - return 0; - } - if (npyarr->curdim >= npyarr->ndim || npyarr->index[npyarr->stridedim] >= npyarr->dim) - { - PRINTMARK(); - // innermost dimension, start retrieving item values - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - return NpyArr_iterNextItem(_obj, tc); - } + if (PyErr_Occurred()) { + PRINTMARK(); + return 0; + } + + if (npyarr->curdim >= npyarr->ndim || + npyarr->index[npyarr->stridedim] >= npyarr->dim) { + PRINTMARK(); + // innermost dimension, start retrieving item values + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + return NpyArr_iterNextItem(_obj, tc); + } - // dig a dimension deeper - npyarr->index[npyarr->stridedim]++; + // dig a dimension deeper + npyarr->index[npyarr->stridedim]++; - npyarr->curdim++; - npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); - npyarr->index[npyarr->stridedim] = 0; + npyarr->curdim++; + npyarr->stridedim += npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->index[npyarr->stridedim] = 0; - ((PyObjectEncoder*) tc->encoder)->npyCtxtPassthru = npyarr; - GET_TC(tc)->itemValue = npyarr->array; - return 1; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; + GET_TC(tc)->itemValue = npyarr->array; + return 1; } -JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - PRINTMARK(); - return GET_TC(tc)->itemValue; +JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); + 
return GET_TC(tc)->itemValue; } -static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, npy_intp idx, char** labels) -{ - JSONObjectEncoder* enc = (JSONObjectEncoder*) tc->encoder; - PRINTMARK(); - *outLen = strlen(labels[idx]); - memcpy(enc->offset, labels[idx], sizeof(char)*(*outLen)); - enc->offset += *outLen; - *outLen = 0; +static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, + npy_intp idx, char **labels) { + JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; + PRINTMARK(); + *outLen = strlen(labels[idx]); + memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); + enc->offset += *outLen; + *outLen = 0; } -char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - NpyArrContext* npyarr = GET_TC(tc)->npyarr; - npy_intp idx; - PRINTMARK(); +char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + npy_intp idx; + PRINTMARK(); - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) - { - idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); - } - else - { - idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); - } - return NULL; + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { + idx = npyarr->index[npyarr->stridedim] - 1; + NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + } else { + idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; + NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + } + return NULL; } - //============================================================================= // Pandas block iteration functions // @@ -878,442 +784,381 @@ char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) // Uses a dedicated NpyArrContext for each column. //============================================================================= +void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + PRINTMARK(); -void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->transpose) - { - blkCtxt->colIdx++; - } - else - { - blkCtxt->colIdx = 0; - } + if (blkCtxt->transpose) { + blkCtxt->colIdx++; + } else { + blkCtxt->colIdx = 0; + } - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); } -int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) -{ - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); +int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + PRINTMARK(); - if (blkCtxt->colIdx >= blkCtxt->ncols) - { - return 0; - } + if (blkCtxt->colIdx >= blkCtxt->ncols) { + return 0; + } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - blkCtxt->colIdx++; - return NpyArr_iterNextItem(obj, tc); + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + blkCtxt->colIdx++; + return NpyArr_iterNextItem(obj, tc); } -char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - npy_intp idx; - PRINTMARK(); - - if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) - { - idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); - } - else - { - idx = GET_TC(tc)->iterNext != PdBlock_iterNext - ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 - : npyarr->index[npyarr->stridedim]; - - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); - } - return NULL; -} - -char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext* npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - npy_intp idx; - PRINTMARK(); - - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) - { - idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); - } - else - { - idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); - } - return NULL; -} - -int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext* npyarr; - PRINTMARK(); - - if (PyErr_Occurred()) - { - return 0; - } - - if (blkCtxt->transpose) - { - if (blkCtxt->colIdx >= blkCtxt->ncols) - { - return 0; - } - } - else - { - npyarr = blkCtxt->npyCtxts[0]; - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) - { - return 0; - } - } +char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; + npy_intp idx; + PRINTMARK(); - ((PyObjectEncoder*) tc->encoder)->blkCtxtPassthru = blkCtxt; - GET_TC(tc)->itemValue = obj; + if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { + idx = blkCtxt->colIdx - 1; + NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + } else { + idx = GET_TC(tc)->iterNext != PdBlock_iterNext + ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1 + : npyarr->index[npyarr->stridedim]; - return 1; + NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + } + return NULL; } -void PdBlockPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); - - if (blkCtxt->transpose) - { - // if transposed we exhaust each column before moving to the next - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - } -} +char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + npy_intp idx; + PRINTMARK(); -void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) -{ - PyObject *obj, *blocks, *block, *values, *tmp; - PyArrayObject *locs; - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - Py_ssize_t i; - PyArray_Descr *dtype; - NpyIter *iter; - NpyIter_IterNextFunc *iternext; - npy_int64 **dataptr; - npy_int64 colIdx; - npy_intp idx; - - PRINTMARK(); - - i = 0; - blocks = NULL; - dtype = PyArray_DescrFromType(NPY_INT64); - obj = (PyObject *)_obj; - - GET_TC(tc)->iterGetName = GET_TC(tc)->transpose ? 
PdBlock_iterGetName_Transpose : PdBlock_iterGetName; - - blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); - if (!blkCtxt) - { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - GET_TC(tc)->pdblock = blkCtxt; - - blkCtxt->colIdx = 0; - blkCtxt->transpose = GET_TC(tc)->transpose; - blkCtxt->ncols = get_attr_length(obj, "columns"); - - if (blkCtxt->ncols == 0) - { - blkCtxt->npyCtxts = NULL; - blkCtxt->cindices = NULL; - - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - blkCtxt->npyCtxts = PyObject_Malloc(sizeof(NpyArrContext*) * blkCtxt->ncols); - if (!blkCtxt->npyCtxts) - { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - for (i = 0; i < blkCtxt->ncols; i++) - { - blkCtxt->npyCtxts[i] = NULL; - } - - blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols); - if (!blkCtxt->cindices) - { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - blocks = get_sub_attr(obj, "_data", "blocks"); - if (!blocks) - { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - // force transpose so each NpyArrContext strides down its column - GET_TC(tc)->transpose = 1; - - for (i = 0; i < PyObject_Length(blocks); i++) - { - block = get_item(blocks, i); - if (!block) - { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { + idx = npyarr->index[npyarr->stridedim] - 1; + NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + } else { + idx = blkCtxt->colIdx; + NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); } + return NULL; +} - tmp = get_values(block); - if (!tmp) - { - ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; - Py_DECREF(block); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } +int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr; + PRINTMARK(); - values = PyArray_Transpose((PyArrayObject*) tmp, NULL); - Py_DECREF(tmp); - if (!values) - { - Py_DECREF(block); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + if (PyErr_Occurred()) { + return 0; } - - locs = (PyArrayObject*) get_sub_attr(block, "mgr_locs", "as_array"); - if (!locs) - { - Py_DECREF(block); - Py_DECREF(values); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + if (blkCtxt->transpose) { + if (blkCtxt->colIdx >= blkCtxt->ncols) { + return 0; + } + } else { + npyarr = blkCtxt->npyCtxts[0]; + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { + return 0; + } } - iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, NPY_NO_CASTING, dtype); - if (!iter) - { - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) - { - NpyIter_Deallocate(iter); - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - dataptr = (npy_int64 **) NpyIter_GetDataPtrArray(iter); - do - { - colIdx = **dataptr; - idx = NpyIter_GetIterIndex(iter); + ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; + GET_TC(tc)->itemValue = obj; - blkCtxt->cindices[colIdx] = idx; + return 1; +} - // Reference freed in Pdblock_iterend - Py_INCREF(values); - GET_TC(tc)->newObj = values; +void PdBlockPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + PRINTMARK(); - // init a 
dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; + if (blkCtxt->transpose) { + // if transposed we exhaust each column before moving to the next + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + } +} - // set the dataptr to our desired column and initialise - if (npyarr != NULL) - { - npyarr->dataptr += npyarr->stride * idx; - NpyArr_iterNext(obj, tc); +void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { + PyObject *obj, *blocks, *block, *values, *tmp; + PyArrayObject *locs; + PdBlockContext *blkCtxt; + NpyArrContext *npyarr; + Py_ssize_t i; + PyArray_Descr *dtype; + NpyIter *iter; + NpyIter_IterNextFunc *iternext; + npy_int64 **dataptr; + npy_int64 colIdx; + npy_intp idx; + + PRINTMARK(); + + i = 0; + blocks = NULL; + dtype = PyArray_DescrFromType(NPY_INT64); + obj = (PyObject *)_obj; + + GET_TC(tc) + ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose + : PdBlock_iterGetName; + + blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); + if (!blkCtxt) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + GET_TC(tc)->pdblock = blkCtxt; + + blkCtxt->colIdx = 0; + blkCtxt->transpose = GET_TC(tc)->transpose; + blkCtxt->ncols = get_attr_length(obj, "columns"); + + if (blkCtxt->ncols == 0) { + blkCtxt->npyCtxts = NULL; + blkCtxt->cindices = NULL; + + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + + blkCtxt->npyCtxts = + PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); + if (!blkCtxt->npyCtxts) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + for (i = 0; i < blkCtxt->ncols; i++) { + blkCtxt->npyCtxts[i] = NULL; + } + + blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols); + if (!blkCtxt->cindices) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + + blocks = get_sub_attr(obj, "_data", "blocks"); + if (!blocks) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + + // force transpose so each NpyArrContext strides down its column + GET_TC(tc)->transpose = 1; + + for (i = 0; i < PyObject_Length(blocks); i++) { + block = get_item(blocks, i); + if (!block) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; } - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder*) tc->encoder)->npyCtxtPassthru = NULL; - blkCtxt->npyCtxts[colIdx] = npyarr; - GET_TC(tc)->newObj = NULL; + tmp = get_values(block); + if (!tmp) { + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + Py_DECREF(block); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + + values = PyArray_Transpose((PyArrayObject *)tmp, NULL); + Py_DECREF(tmp); + if (!values) { + Py_DECREF(block); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } - } while (iternext(iter)); + locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array"); + if (!locs) { + Py_DECREF(block); + Py_DECREF(values); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + + iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, + NPY_NO_CASTING, dtype); + if (!iter) { + Py_DECREF(block); + Py_DECREF(values); + Py_DECREF(locs); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto BLKRET; + } + iternext = NpyIter_GetIterNext(iter, NULL); + if (!iternext) { + NpyIter_Deallocate(iter); + Py_DECREF(block); + Py_DECREF(values); + Py_DECREF(locs); + GET_TC(tc)->iterNext = 
NpyArr_iterNextNone; + goto BLKRET; + } + dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter); + do { + colIdx = **dataptr; + idx = NpyIter_GetIterIndex(iter); + + blkCtxt->cindices[colIdx] = idx; + + // Reference freed in Pdblock_iterend + Py_INCREF(values); + GET_TC(tc)->newObj = values; + + // init a dedicated context for this column + NpyArr_iterBegin(obj, tc); + npyarr = GET_TC(tc)->npyarr; + + // set the dataptr to our desired column and initialise + if (npyarr != NULL) { + npyarr->dataptr += npyarr->stride * idx; + NpyArr_iterNext(obj, tc); + } + GET_TC(tc)->itemValue = NULL; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - NpyIter_Deallocate(iter); - Py_DECREF(block); - Py_DECREF(values); - Py_DECREF(locs); - } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; + blkCtxt->npyCtxts[colIdx] = npyarr; + GET_TC(tc)->newObj = NULL; + } while (iternext(iter)); + + NpyIter_Deallocate(iter); + Py_DECREF(block); + Py_DECREF(values); + Py_DECREF(locs); + } + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; BLKRET: - Py_XDECREF(dtype); - Py_XDECREF(blocks); + Py_XDECREF(dtype); + Py_XDECREF(blocks); } -void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - int i; - PRINTMARK(); +void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt; + NpyArrContext *npyarr; + int i; + PRINTMARK(); - GET_TC(tc)->itemValue = NULL; - npyarr = GET_TC(tc)->npyarr; + GET_TC(tc)->itemValue = NULL; + npyarr = GET_TC(tc)->npyarr; - blkCtxt = GET_TC(tc)->pdblock; + blkCtxt = GET_TC(tc)->pdblock; - if (blkCtxt) - { - for (i = 0; i < blkCtxt->ncols; i++) - { - npyarr = blkCtxt->npyCtxts[i]; - if (npyarr) - { - if (npyarr->array) - { - Py_DECREF(npyarr->array); - npyarr->array = NULL; - } + if (blkCtxt) { + for (i = 0; i < blkCtxt->ncols; i++) { + npyarr = blkCtxt->npyCtxts[i]; + if (npyarr) { + if (npyarr->array) { + Py_DECREF(npyarr->array); + npyarr->array = NULL; + } - GET_TC(tc)->npyarr = npyarr; - NpyArr_iterEnd(obj, tc); + GET_TC(tc)->npyarr = npyarr; + NpyArr_iterEnd(obj, tc); - blkCtxt->npyCtxts[i] = NULL; - } - } + blkCtxt->npyCtxts[i] = NULL; + } + } - if (blkCtxt->npyCtxts) - { - PyObject_Free(blkCtxt->npyCtxts); - } - if (blkCtxt->cindices) - { - PyObject_Free(blkCtxt->cindices); + if (blkCtxt->npyCtxts) { + PyObject_Free(blkCtxt->npyCtxts); + } + if (blkCtxt->cindices) { + PyObject_Free(blkCtxt->cindices); + } + PyObject_Free(blkCtxt); } - PyObject_Free(blkCtxt); - } } - //============================================================================= // Tuple iteration functions // itemValue is borrowed reference, no ref counting //============================================================================= -void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyTuple_GET_SIZE( (PyObject *) obj); - GET_TC(tc)->itemValue = NULL; +void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj); + GET_TC(tc)->itemValue = NULL; } -int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - PyObject *item; +int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) { + PyObject *item; - if (GET_TC(tc)->index >= GET_TC(tc)->size) - { - return 0; - } + if (GET_TC(tc)->index >= GET_TC(tc)->size) { + return 0; + } - item = PyTuple_GET_ITEM (obj, GET_TC(tc)->index); + item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index); - GET_TC(tc)->itemValue = item; - GET_TC(tc)->index ++; - return 1; + GET_TC(tc)->itemValue = item; + 
GET_TC(tc)->index++; + return 1; } -void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ -} +void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) {} -JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->itemValue; +JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - return NULL; +char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + return NULL; } //============================================================================= // Iterator iteration functions // itemValue is borrowed reference, no ref counting //============================================================================= -void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->itemValue = NULL; - GET_TC(tc)->iterator = PyObject_GetIter(obj); +void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->itemValue = NULL; + GET_TC(tc)->iterator = PyObject_GetIter(obj); } -int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - PyObject *item; +int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) { + PyObject *item; - if (GET_TC(tc)->itemValue) - { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - item = PyIter_Next(GET_TC(tc)->iterator); + item = PyIter_Next(GET_TC(tc)->iterator); - if (item == NULL) - { - return 0; - } + if (item == NULL) { + return 0; + } - GET_TC(tc)->itemValue = item; - return 1; + GET_TC(tc)->itemValue = item; + return 1; } -void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - if (GET_TC(tc)->itemValue) - { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } +void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - if (GET_TC(tc)->iterator) - { - Py_DECREF(GET_TC(tc)->iterator); - GET_TC(tc)->iterator = NULL; - } + if (GET_TC(tc)->iterator) { + Py_DECREF(GET_TC(tc)->iterator); + GET_TC(tc)->iterator = NULL; + } } -JSOBJ Iter_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->itemValue; +JSOBJ Iter_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Iter_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - return NULL; +char *Iter_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + return NULL; } //============================================================================= @@ -1321,387 +1166,312 @@ char *Iter_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) // itemName ref is borrowed from PyObject_Dir (attrList). No refcount // itemValue ref is from PyObject_GetAttr. 
Ref counted //============================================================================= -void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->attrList = PyObject_Dir(obj); - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); - PRINTMARK(); +void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->attrList = PyObject_Dir(obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); + PRINTMARK(); } -void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - if (GET_TC(tc)->itemValue) - { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } +void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - if (GET_TC(tc)->itemName) - { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } - Py_DECREF( (PyObject *) GET_TC(tc)->attrList); - PRINTMARK(); + Py_DECREF((PyObject *)GET_TC(tc)->attrList); + PRINTMARK(); } -int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) -{ - PyObject *obj = (PyObject *) _obj; - PyObject *itemValue = GET_TC(tc)->itemValue; - PyObject *itemName = GET_TC(tc)->itemName; - PyObject* attr; - PyObject* attrName; - char* attrStr; - - if (itemValue) - { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = itemValue = NULL; - } - - if (itemName) - { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = itemName = NULL; - } - - for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index ++) - { - attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); +int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { + PyObject *obj = (PyObject *)_obj; + PyObject *itemValue = GET_TC(tc)->itemValue; + PyObject *itemName = GET_TC(tc)->itemName; + PyObject *attr; + PyObject *attrName; + char *attrStr; + + if (itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = itemValue = NULL; + } + + if (itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = itemName = NULL; + } + + for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) { + attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); #if PY_MAJOR_VERSION >= 3 - attr = PyUnicode_AsUTF8String(attrName); + attr = PyUnicode_AsUTF8String(attrName); #else - attr = attrName; - Py_INCREF(attr); + attr = attrName; + Py_INCREF(attr); #endif - attrStr = PyString_AS_STRING(attr); + attrStr = PyString_AS_STRING(attr); - if (attrStr[0] == '_') - { - PRINTMARK(); - Py_DECREF(attr); - continue; - } + if (attrStr[0] == '_') { + PRINTMARK(); + Py_DECREF(attr); + continue; + } - itemValue = PyObject_GetAttr(obj, attrName); - if (itemValue == NULL) - { - PyErr_Clear(); - Py_DECREF(attr); - PRINTMARK(); - continue; + itemValue = PyObject_GetAttr(obj, attrName); + if (itemValue == NULL) { + PyErr_Clear(); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + if (PyCallable_Check(itemValue)) { + Py_DECREF(itemValue); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index++; + + PRINTMARK(); + itemName = attr; + break; } - if (PyCallable_Check(itemValue)) - { - Py_DECREF(itemValue); - Py_DECREF(attr); - PRINTMARK(); - continue; + if (itemName == NULL) { + GET_TC(tc)->index = GET_TC(tc)->size; + GET_TC(tc)->itemValue = NULL; + return 0; } GET_TC(tc)->itemName = itemName; 
GET_TC(tc)->itemValue = itemValue; - GET_TC(tc)->index ++; + GET_TC(tc)->index++; PRINTMARK(); - itemName = attr; - break; - } - - if (itemName == NULL) - { - GET_TC(tc)->index = GET_TC(tc)->size; - GET_TC(tc)->itemValue = NULL; - return 0; - } - - GET_TC(tc)->itemName = itemName; - GET_TC(tc)->itemValue = itemValue; - GET_TC(tc)->index ++; - - PRINTMARK(); - return 1; + return 1; } -JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - PRINTMARK(); - return GET_TC(tc)->itemValue; +JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); + return GET_TC(tc)->itemValue; } -char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - PRINTMARK(); - *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); - return PyString_AS_STRING(GET_TC(tc)->itemName); +char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + PRINTMARK(); + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); } - //============================================================================= // List iteration functions // itemValue is borrowed from object (which is list). No refcounting //============================================================================= -void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE( (PyObject *) obj); +void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj); } -int List_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - if (GET_TC(tc)->index >= GET_TC(tc)->size) - { - PRINTMARK(); - return 0; - } +int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { + if (GET_TC(tc)->index >= GET_TC(tc)->size) { + PRINTMARK(); + return 0; + } - GET_TC(tc)->itemValue = PyList_GET_ITEM (obj, GET_TC(tc)->index); - GET_TC(tc)->index ++; - return 1; + GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index); + GET_TC(tc)->index++; + return 1; } -void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ -} +void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) {} -JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->itemValue; +JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - return NULL; +char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + return NULL; } //============================================================================= // pandas Index iteration functions //============================================================================= -void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->cStr) - { - PyErr_NoMemory(); - } - PRINTMARK(); +void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } + PRINTMARK(); } -int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - Py_ssize_t index; - if (!GET_TC(tc)->cStr) - { - return 0; - } - - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) - { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char)*5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } - else - if (index == 1) - { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); - GET_TC(tc)->itemValue = get_values(obj); - if 
(!GET_TC(tc)->itemValue) - { - return 0; +int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { + Py_ssize_t index; + if (!GET_TC(tc)->cStr) { + return 0; } - } - else - { - PRINTMARK(); - return 0; - } - GET_TC(tc)->index++; - PRINTMARK(); - return 1; -} + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; + } + } else { + PRINTMARK(); + return 0; + } -void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - PRINTMARK(); + GET_TC(tc)->index++; + PRINTMARK(); + return 1; } -JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->itemValue; +void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } + +JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; +char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= // pandas Series iteration functions //============================================================================= -void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) - { - PyErr_NoMemory(); - } - PRINTMARK(); -} - -int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - Py_ssize_t index; - if (!GET_TC(tc)->cStr) - { - return 0; - } - - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) - { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char)*5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } - else - if (index == 1) - { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char)*6); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } - else - if (index == 2) - { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) - { - return 0; +void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); } - } - else - { PRINTMARK(); - return 0; - } +} + +int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { + Py_ssize_t index; + if (!GET_TC(tc)->cStr) { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } else if (index == 2) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; + } + } else { + PRINTMARK(); + return 0; + } - 
GET_TC(tc)->index++; - PRINTMARK(); - return 1; + GET_TC(tc)->index++; + PRINTMARK(); + return 1; } -void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; - enc->outputFormat = enc->originalOutputFormat; - PRINTMARK(); +void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + enc->outputFormat = enc->originalOutputFormat; + PRINTMARK(); } -JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->itemValue; +JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; +char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= // pandas DataFrame iteration functions //============================================================================= -void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) - { - PyErr_NoMemory(); - } - PRINTMARK(); -} - -int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - Py_ssize_t index; - if (!GET_TC(tc)->cStr) - { - return 0; - } - - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) - { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char)*8); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); - } - else - if (index == 1) - { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char)*6); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } - else - if (index == 2) - { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); - if (is_simple_frame(obj)) - { - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) - { +void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { + Py_ssize_t index; + if (!GET_TC(tc)->cStr) { return 0; - } } - else - { - Py_INCREF(obj); - GET_TC(tc)->itemValue = obj; + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } else if (index == 2) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + if (is_simple_frame(obj)) { + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; + } + } else { + Py_INCREF(obj); + GET_TC(tc)->itemValue = obj; + } + } else { + PRINTMARK(); + return 0; } - } - else - { - PRINTMARK(); - return 0; - } - GET_TC(tc)->index++; - PRINTMARK(); - return 1; + GET_TC(tc)->index++; + PRINTMARK(); + return 1; } -void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - PyObjectEncoder* enc = 
(PyObjectEncoder*) tc->encoder; - enc->outputFormat = enc->originalOutputFormat; - PRINTMARK(); +void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + enc->outputFormat = enc->originalOutputFormat; + PRINTMARK(); } -JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->itemValue; +JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; +char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= @@ -1709,124 +1479,105 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) // itemName might converted to string (Python_Str). Do refCounting // itemValue is borrowed from object (which is dict). No refCounting //============================================================================= -void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->index = 0; - PRINTMARK(); +void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->index = 0; + PRINTMARK(); } -int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ +int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) { #if PY_MAJOR_VERSION >= 3 - PyObject* itemNameTmp; + PyObject *itemNameTmp; #endif - if (GET_TC(tc)->itemName) - { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, + &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) { + PRINTMARK(); + return 0; + } - if (!PyDict_Next ( (PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) - { - PRINTMARK(); - return 0; - } - - if (PyUnicode_Check(GET_TC(tc)->itemName)) - { - GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); - } - else - if (!PyString_Check(GET_TC(tc)->itemName)) - { - GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); + if (PyUnicode_Check(GET_TC(tc)->itemName)) { + GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); + } else if (!PyString_Check(GET_TC(tc)->itemName)) { + GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); #if PY_MAJOR_VERSION >= 3 - itemNameTmp = GET_TC(tc)->itemName; - GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); - Py_DECREF(itemNameTmp); + itemNameTmp = GET_TC(tc)->itemName; + GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); + Py_DECREF(itemNameTmp); #endif - } - else - { - Py_INCREF(GET_TC(tc)->itemName); + } else { + Py_INCREF(GET_TC(tc)->itemName); } PRINTMARK(); return 1; } -void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - if (GET_TC(tc)->itemName) - { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } - Py_DECREF(GET_TC(tc)->dictObj); - PRINTMARK(); +void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + Py_DECREF(GET_TC(tc)->dictObj); + PRINTMARK(); } -JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->itemValue; +JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; 
} -char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); - return PyString_AS_STRING(GET_TC(tc)->itemName); +char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); } -void NpyArr_freeLabels(char** labels, npy_intp len) -{ +void NpyArr_freeLabels(char **labels, npy_intp len) { npy_intp i; - if (labels) - { - for (i = 0; i < len; i++) - { + if (labels) { + for (i = 0; i < len; i++) { PyObject_Free(labels[i]); } PyObject_Free(labels); } } -char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_intp num) -{ +char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, + npy_intp num) { // NOTE this function steals a reference to labels. - PyObjectEncoder* pyenc = (PyObjectEncoder *) enc; - PyObject* item = NULL; + PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; + PyObject *item = NULL; npy_intp i, stride, len, need_quotes; - char** ret; + char **ret; char *dataptr, *cLabel, *origend, *origst, *origoffset; char labelBuffer[NPY_JSON_BUFSIZE]; - PyArray_GetItemFunc* getitem; + PyArray_GetItemFunc *getitem; int type_num; PRINTMARK(); - if (!labels) - { - return 0; + if (!labels) { + return 0; } - if (PyArray_SIZE(labels) < num) - { - PyErr_SetString(PyExc_ValueError, "Label array sizes do not match corresponding data shape"); + if (PyArray_SIZE(labels) < num) { + PyErr_SetString( + PyExc_ValueError, + "Label array sizes do not match corresponding data shape"); Py_DECREF(labels); return 0; } - ret = PyObject_Malloc(sizeof(char*)*num); - if (!ret) - { + ret = PyObject_Malloc(sizeof(char *) * num); + if (!ret) { PyErr_NoMemory(); Py_DECREF(labels); return 0; } - for (i = 0; i < num; i++) - { + for (i = 0; i < num; i++) { ret[i] = NULL; } @@ -1836,45 +1587,37 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); - getitem = (PyArray_GetItemFunc*) PyArray_DESCR(labels)->f->getitem; + getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem; type_num = PyArray_TYPE(labels); - for (i = 0; i < num; i++) - { + for (i = 0; i < num; i++) { #if NPY_API_VERSION < 0x00000007 - if(PyTypeNum_ISDATETIME(type_num)) - { - item = PyArray_ToScalar(dataptr, labels); - } - else if(PyTypeNum_ISNUMBER(type_num)) + if (PyTypeNum_ISDATETIME(type_num)) { + item = PyArray_ToScalar(dataptr, labels); + } else if (PyTypeNum_ISNUMBER(type_num)) // NOLINT #else - if(PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) + if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) // NOLINT #endif { - item = (PyObject *) labels; - pyenc->npyType = type_num; - pyenc->npyValue = dataptr; - } - else - { - item = getitem(dataptr, labels); - if (!item) - { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } + item = (PyObject *)labels; + pyenc->npyType = type_num; + pyenc->npyValue = dataptr; + } else { + item = getitem(dataptr, labels); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } } cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); - if (item != (PyObject *) labels) - { - Py_DECREF(item); + if (item != (PyObject *)labels) { + Py_DECREF(item); } - if (PyErr_Occurred() || enc->errorMsg) - { + if (PyErr_Occurred() || enc->errorMsg) { NpyArr_freeLabels(ret, num); ret = 0; break; @@ -1882,27 +1625,23 @@ char** 
NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in need_quotes = ((*cLabel) != '"'); len = enc->offset - cLabel + 1 + 2 * need_quotes; - ret[i] = PyObject_Malloc(sizeof(char)*len); + ret[i] = PyObject_Malloc(sizeof(char) * len); - if (!ret[i]) - { + if (!ret[i]) { PyErr_NoMemory(); ret = 0; break; } - if (need_quotes) - { - ret[i][0] = '"'; - memcpy(ret[i]+1, cLabel, sizeof(char)*(len-4)); - ret[i][len-3] = '"'; - } - else - { - memcpy(ret[i], cLabel, sizeof(char)*(len-2)); + if (need_quotes) { + ret[i][0] = '"'; + memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4)); + ret[i][len - 3] = '"'; + } else { + memcpy(ret[i], cLabel, sizeof(char) * (len - 2)); } - ret[i][len-2] = ':'; - ret[i][len-1] = '\0'; + ret[i][len - 2] = ':'; + ret[i][len - 1] = '\0'; dataptr += stride; } @@ -1914,772 +1653,650 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in return ret; } -void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) -{ - PyObject *tmpObj = NULL; - PRINTMARK(); - tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); - if (!PyErr_Occurred()) - { - if (tmpObj == NULL) - { - PyErr_SetString(PyExc_TypeError, "Failed to execute default handler"); - } - else - { - encode (tmpObj, (JSONObjectEncoder*) enc, NULL, 0); +void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) { + PyObject *tmpObj = NULL; + PRINTMARK(); + tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); + if (!PyErr_Occurred()) { + if (tmpObj == NULL) { + PyErr_SetString(PyExc_TypeError, + "Failed to execute default handler"); + } else { + encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0); + } } - } - Py_XDECREF(tmpObj); - return; + Py_XDECREF(tmpObj); + return; } -void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) -{ - PyObject *obj, *exc, *toDictFunc, *tmpObj, *values; - TypeContext *pc; - PyObjectEncoder *enc; - double val; - npy_int64 value; - int base; - PRINTMARK(); - - tc->prv = NULL; +void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { + PyObject *obj, *exc, *toDictFunc, *tmpObj, *values; + TypeContext *pc; + PyObjectEncoder *enc; + double val; + npy_int64 value; + int base; + PRINTMARK(); - if (!_obj) { - tc->type = JT_INVALID; - return; - } + tc->prv = NULL; - obj = (PyObject*) _obj; - enc = (PyObjectEncoder*) tc->encoder; + if (!_obj) { + tc->type = JT_INVALID; + return; + } - if (enc->npyType >= 0) - { - PRINTMARK(); - tc->prv = &(enc->basicTypeContext); - tc->type = NpyTypeToJSONType(obj, tc, enc->npyType, enc->npyValue); + obj = (PyObject *)_obj; + enc = (PyObjectEncoder *)tc->encoder; - if (tc->type == JT_INVALID) - { - if(enc->defaultHandler) - { + if (enc->npyType >= 0) { + PRINTMARK(); + tc->prv = &(enc->basicTypeContext); + tc->type = NpyTypeToJSONType(obj, tc, enc->npyType, enc->npyValue); + + if (tc->type == JT_INVALID) { + if (enc->defaultHandler) { + enc->npyType = -1; + PRINTMARK(); + Object_invokeDefaultHandler( + enc->npyCtxtPassthru->getitem(enc->npyValue, + enc->npyCtxtPassthru->array), + enc); + } else { + PyErr_Format(PyExc_RuntimeError, "Unhandled numpy dtype %d", + enc->npyType); + } + } + enc->npyCtxtPassthru = NULL; enc->npyType = -1; + return; + } + + if (PyBool_Check(obj)) { PRINTMARK(); - Object_invokeDefaultHandler(enc->npyCtxtPassthru->getitem(enc->npyValue, enc->npyCtxtPassthru->array), enc); - } - else - { - PyErr_Format ( - PyExc_RuntimeError, - "Unhandled numpy dtype %d", - enc->npyType); - } - } - enc->npyCtxtPassthru = NULL; - enc->npyType = -1; - 
return; - } + tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; + return; + } else if (obj == Py_None) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } - if (PyBool_Check(obj)) - { - PRINTMARK(); - tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; - return; - } - else - if (obj == Py_None) - { - PRINTMARK(); - tc->type = JT_NULL; - return; - } - - pc = createTypeContext(); - if (!pc) - { - tc->type = JT_INVALID; - return; - } - tc->prv = pc; + pc = createTypeContext(); + if (!pc) { + tc->type = JT_INVALID; + return; + } + tc->prv = pc; - if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj) )) - { - PRINTMARK(); - goto ISITERABLE; - } + if (PyIter_Check(obj) || + (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { + PRINTMARK(); + goto ISITERABLE; + } - if (PyLong_Check(obj)) - { - PRINTMARK(); - pc->PyTypeToJSON = PyLongToINT64; - tc->type = JT_LONG; - GET_TC(tc)->longValue = PyLong_AsLongLong(obj); + if (PyLong_Check(obj)) { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + GET_TC(tc)->longValue = PyLong_AsLongLong(obj); - exc = PyErr_Occurred(); + exc = PyErr_Occurred(); - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) - { - PRINTMARK(); - goto INVALID; - } + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } - return; - } - else - if (PyInt_Check(obj)) - { - PRINTMARK(); + return; + } else if (PyInt_Check(obj)) { + PRINTMARK(); #ifdef _LP64 - pc->PyTypeToJSON = PyIntToINT64; tc->type = JT_LONG; + pc->PyTypeToJSON = PyIntToINT64; + tc->type = JT_LONG; #else - pc->PyTypeToJSON = PyIntToINT32; tc->type = JT_INT; + pc->PyTypeToJSON = PyIntToINT32; + tc->type = JT_INT; #endif - return; - } - else - if (PyFloat_Check(obj)) - { - PRINTMARK(); - val = PyFloat_AS_DOUBLE (obj); - if (npy_isnan(val) || npy_isinf(val)) - { - tc->type = JT_NULL; - } - else - { - pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; - } - return; - } - else - if (PyString_Check(obj)) - { - PRINTMARK(); - pc->PyTypeToJSON = PyStringToUTF8; tc->type = JT_UTF8; - return; - } - else - if (PyUnicode_Check(obj)) - { - PRINTMARK(); - pc->PyTypeToJSON = PyUnicodeToUTF8; tc->type = JT_UTF8; - return; - } - else - if (PyObject_IsInstance(obj, type_decimal)) - { - PRINTMARK(); - pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; - return; - } - else - if (PyDateTime_Check(obj) || PyDate_Check(obj)) - { - if (PyObject_TypeCheck(obj, cls_nat)) - { - PRINTMARK(); - tc->type = JT_NULL; - return; - } - - PRINTMARK(); - pc->PyTypeToJSON = PyDateTimeToJSON; - if (enc->datetimeIso) - { - PRINTMARK(); - tc->type = JT_UTF8; - } - else - { - PRINTMARK(); - tc->type = JT_LONG; - } - return; - } - else - if (PyTime_Check(obj)) - { - PRINTMARK(); - pc->PyTypeToJSON = PyTimeToJSON; tc->type = JT_UTF8; - return; - } - else - if (PyArray_IsScalar(obj, Datetime)) - { - PRINTMARK(); - if (((PyDatetimeScalarObject*) obj)->obval == get_nat()) { - PRINTMARK(); - tc->type = JT_NULL; - return; - } - - PRINTMARK(); - pc->PyTypeToJSON = NpyDateTimeScalarToJSON; - tc->type = enc->datetimeIso ? 
JT_UTF8 : JT_LONG; - return; - } - else - if (PyDelta_Check(obj)) - { - if (PyObject_HasAttrString(obj, "value")) - { - PRINTMARK(); - value = get_long_attr(obj, "value"); - } - else - { - PRINTMARK(); - value = total_seconds(obj) * 1000000000LL; // nanoseconds per second - } - - base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; - switch (base) - { - case PANDAS_FR_ns: - break; - case PANDAS_FR_us: - value /= 1000LL; - break; - case PANDAS_FR_ms: - value /= 1000000LL; - break; - case PANDAS_FR_s: - value /= 1000000000LL; - break; - } - - exc = PyErr_Occurred(); + return; + } else if (PyFloat_Check(obj)) { + PRINTMARK(); + val = PyFloat_AS_DOUBLE(obj); + if (npy_isnan(val) || npy_isinf(val)) { + tc->type = JT_NULL; + } else { + pc->PyTypeToJSON = PyFloatToDOUBLE; + tc->type = JT_DOUBLE; + } + return; + } else if (PyString_Check(obj)) { + PRINTMARK(); + pc->PyTypeToJSON = PyStringToUTF8; + tc->type = JT_UTF8; + return; + } else if (PyUnicode_Check(obj)) { + PRINTMARK(); + pc->PyTypeToJSON = PyUnicodeToUTF8; + tc->type = JT_UTF8; + return; + } else if (PyObject_IsInstance(obj, type_decimal)) { + PRINTMARK(); + pc->PyTypeToJSON = PyFloatToDOUBLE; + tc->type = JT_DOUBLE; + return; + } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { + if (PyObject_TypeCheck(obj, cls_nat)) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) - { - PRINTMARK(); - goto INVALID; - } + PRINTMARK(); + pc->PyTypeToJSON = PyDateTimeToJSON; + if (enc->datetimeIso) { + PRINTMARK(); + tc->type = JT_UTF8; + } else { + PRINTMARK(); + tc->type = JT_LONG; + } + return; + } else if (PyTime_Check(obj)) { + PRINTMARK(); + pc->PyTypeToJSON = PyTimeToJSON; + tc->type = JT_UTF8; + return; + } else if (PyArray_IsScalar(obj, Datetime)) { + PRINTMARK(); + if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } - if (value == get_nat()) - { - PRINTMARK(); - tc->type = JT_NULL; - return; - } + PRINTMARK(); + pc->PyTypeToJSON = NpyDateTimeScalarToJSON; + tc->type = enc->datetimeIso ? JT_UTF8 : JT_LONG; + return; + } else if (PyDelta_Check(obj)) { + if (PyObject_HasAttrString(obj, "value")) { + PRINTMARK(); + value = get_long_attr(obj, "value"); + } else { + PRINTMARK(); + value = + total_seconds(obj) * 1000000000LL; // nanoseconds per second + } - GET_TC(tc)->longValue = value; + base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + switch (base) { + case PANDAS_FR_ns: + break; + case PANDAS_FR_us: + value /= 1000LL; + break; + case PANDAS_FR_ms: + value /= 1000000LL; + break; + case PANDAS_FR_s: + value /= 1000000000LL; + break; + } - PRINTMARK(); - pc->PyTypeToJSON = PyLongToINT64; - tc->type = JT_LONG; - return; - } - else - if (PyArray_IsScalar(obj, Integer)) - { - PRINTMARK(); - pc->PyTypeToJSON = PyLongToINT64; - tc->type = JT_LONG; - PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); + exc = PyErr_Occurred(); - exc = PyErr_Occurred(); + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) - { - PRINTMARK(); - goto INVALID; - } + if (value == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } - return; - } - else - if (PyArray_IsScalar(obj, Bool)) - { - PRINTMARK(); - PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_BOOL)); - tc->type = (GET_TC(tc)->longValue) ? 
JT_TRUE : JT_FALSE; - return; - } - else - if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) - { - PRINTMARK(); - pc->PyTypeToJSON = NpyFloatToDOUBLE; tc->type = JT_DOUBLE; - return; - } - else - if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) { - tmpObj = PyObject_Repr(obj); - PyErr_Format( - PyExc_TypeError, - "%s (0d array) is not JSON serializable at the moment", - PyString_AS_STRING(tmpObj) - ); - Py_DECREF(tmpObj); - goto INVALID; - } + GET_TC(tc)->longValue = value; -ISITERABLE: + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + return; + } else if (PyArray_IsScalar(obj, Integer)) { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), + PyArray_DescrFromType(NPY_INT64)); - if (PyObject_TypeCheck(obj, cls_index)) - { - if (enc->outputFormat == SPLIT) - { - PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = Index_iterBegin; - pc->iterEnd = Index_iterEnd; - pc->iterNext = Index_iterNext; - pc->iterGetValue = Index_iterGetValue; - pc->iterGetName = Index_iterGetName; - return; - } - - pc->newObj = get_values(obj); - if (pc->newObj) - { - PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - } - else - { - goto INVALID; - } + exc = PyErr_Occurred(); - return; - } - else - if (PyObject_TypeCheck(obj, cls_series)) - { - if (enc->outputFormat == SPLIT) - { - PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = Series_iterBegin; - pc->iterEnd = Series_iterEnd; - pc->iterNext = Series_iterNext; - pc->iterGetValue = Series_iterGetValue; - pc->iterGetName = Series_iterGetName; - return; - } - - pc->newObj = get_values(obj); - if (!pc->newObj) - { - goto INVALID; - } + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } - if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) - { - PRINTMARK(); - tc->type = JT_OBJECT; - tmpObj = PyObject_GetAttrString(obj, "index"); - if (!tmpObj) - { - goto INVALID; - } - values = get_values(tmpObj); - Py_DECREF(tmpObj); - if (!values) - { - goto INVALID; - } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->columnLabelsLen); - if (!pc->columnLabels) - { + return; + } else if (PyArray_IsScalar(obj, Bool)) { + PRINTMARK(); + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), + PyArray_DescrFromType(NPY_BOOL)); + tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE; + return; + } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { + PRINTMARK(); + pc->PyTypeToJSON = NpyFloatToDOUBLE; + tc->type = JT_DOUBLE; + return; + } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) { + tmpObj = PyObject_Repr(obj); + PyErr_Format(PyExc_TypeError, + "%s (0d array) is not JSON serializable at the moment", + PyString_AS_STRING(tmpObj)); + Py_DECREF(tmpObj); goto INVALID; - } } - else - { - PRINTMARK(); - tc->type = JT_ARRAY; - } - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } - else - if (PyArray_Check(obj)) - { - if (enc->npyCtxtPassthru) - { - PRINTMARK(); - pc->npyarr = enc->npyCtxtPassthru; - tc->type = (pc->npyarr->columnLabels ? 
JT_OBJECT : JT_ARRAY); - pc->iterBegin = NpyArrPassThru_iterBegin; - pc->iterNext = NpyArr_iterNext; - pc->iterEnd = NpyArrPassThru_iterEnd; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; +ISITERABLE: - enc->npyCtxtPassthru = NULL; - return; - } + if (PyObject_TypeCheck(obj, cls_index)) { + if (enc->outputFormat == SPLIT) { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Index_iterBegin; + pc->iterEnd = Index_iterEnd; + pc->iterNext = Index_iterNext; + pc->iterGetValue = Index_iterGetValue; + pc->iterGetName = Index_iterGetName; + return; + } - PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } - else - if (PyObject_TypeCheck(obj, cls_dataframe)) - { - if (enc->blkCtxtPassthru) - { - PRINTMARK(); - pc->pdblock = enc->blkCtxtPassthru; - tc->type = (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY); + pc->newObj = get_values(obj); + if (pc->newObj) { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + } else { + goto INVALID; + } - pc->iterBegin = PdBlockPassThru_iterBegin; - pc->iterEnd = PdBlockPassThru_iterEnd; - pc->iterNext = PdBlock_iterNextItem; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; + return; + } else if (PyObject_TypeCheck(obj, cls_series)) { + if (enc->outputFormat == SPLIT) { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Series_iterBegin; + pc->iterEnd = Series_iterEnd; + pc->iterNext = Series_iterNext; + pc->iterGetValue = Series_iterGetValue; + pc->iterGetName = Series_iterGetName; + return; + } - enc->blkCtxtPassthru = NULL; - return; - } + pc->newObj = get_values(obj); + if (!pc->newObj) { + goto INVALID; + } - if (enc->outputFormat == SPLIT) - { - PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = DataFrame_iterBegin; - pc->iterEnd = DataFrame_iterEnd; - pc->iterNext = DataFrame_iterNext; - pc->iterGetValue = DataFrame_iterGetValue; - pc->iterGetName = DataFrame_iterGetName; - return; - } + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + PRINTMARK(); + tc->type = JT_OBJECT; + tmpObj = PyObject_GetAttrString(obj, "index"); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + Py_DECREF(tmpObj); + if (!values) { + goto INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, + (JSONObjectEncoder *)enc, + pc->columnLabelsLen); + if (!pc->columnLabels) { + goto INVALID; + } + } else { + PRINTMARK(); + tc->type = JT_ARRAY; + } + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } else if (PyArray_Check(obj)) { + if (enc->npyCtxtPassthru) { + PRINTMARK(); + pc->npyarr = enc->npyCtxtPassthru; + tc->type = (pc->npyarr->columnLabels ? 
JT_OBJECT : JT_ARRAY); + + pc->iterBegin = NpyArrPassThru_iterBegin; + pc->iterNext = NpyArr_iterNext; + pc->iterEnd = NpyArrPassThru_iterEnd; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + + enc->npyCtxtPassthru = NULL; + return; + } - PRINTMARK(); - if (is_simple_frame(obj)) - { - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetName = NpyArr_iterGetName; - - pc->newObj = get_values(obj); - if (!pc->newObj) - { - goto INVALID; - } - } - else - { - pc->iterBegin = PdBlock_iterBegin; - pc->iterEnd = PdBlock_iterEnd; - pc->iterNext = PdBlock_iterNext; - pc->iterGetName = PdBlock_iterGetName; - } - pc->iterGetValue = NpyArr_iterGetValue; + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } else if (PyObject_TypeCheck(obj, cls_dataframe)) { + if (enc->blkCtxtPassthru) { + PRINTMARK(); + pc->pdblock = enc->blkCtxtPassthru; + tc->type = + (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY); + + pc->iterBegin = PdBlockPassThru_iterBegin; + pc->iterEnd = PdBlockPassThru_iterEnd; + pc->iterNext = PdBlock_iterNextItem; + pc->iterGetName = PdBlock_iterGetName; + pc->iterGetValue = NpyArr_iterGetValue; + + enc->blkCtxtPassthru = NULL; + return; + } - if (enc->outputFormat == VALUES) - { - PRINTMARK(); - tc->type = JT_ARRAY; - } - else - if (enc->outputFormat == RECORDS) - { - PRINTMARK(); - tc->type = JT_ARRAY; - tmpObj = PyObject_GetAttrString(obj, "columns"); - if (!tmpObj) - { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) - { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) - { - goto INVALID; - } - } - else - if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) - { - PRINTMARK(); - tc->type = JT_OBJECT; - tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "index") : PyObject_GetAttrString(obj, "columns")); - if (!tmpObj) - { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) - { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->rowLabelsLen); - Py_DECREF(tmpObj); - tmpObj = (enc->outputFormat == INDEX ? 
PyObject_GetAttrString(obj, "columns") : PyObject_GetAttrString(obj, "index")); - if (!tmpObj) - { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - values = get_values(tmpObj); - if (!values) - { - Py_DECREF(tmpObj); - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) - { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } + if (enc->outputFormat == SPLIT) { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } - if (enc->outputFormat == COLUMNS) - { PRINTMARK(); - pc->transpose = 1; - } - } - else - { - goto INVALID; - } - return; - } - else - if (PyDict_Check(obj)) - { - PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = obj; - Py_INCREF(obj); - - return; - } - else - if (PyList_Check(obj)) - { - PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = List_iterBegin; - pc->iterEnd = List_iterEnd; - pc->iterNext = List_iterNext; - pc->iterGetValue = List_iterGetValue; - pc->iterGetName = List_iterGetName; - return; - } - else - if (PyTuple_Check(obj)) - { - PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = Tuple_iterBegin; - pc->iterEnd = Tuple_iterEnd; - pc->iterNext = Tuple_iterNext; - pc->iterGetValue = Tuple_iterGetValue; - pc->iterGetName = Tuple_iterGetName; - return; - } - else - if (PyAnySet_Check(obj)) - { - PRINTMARK(); - tc->type = JT_ARRAY; - pc->iterBegin = Iter_iterBegin; - pc->iterEnd = Iter_iterEnd; - pc->iterNext = Iter_iterNext; - pc->iterGetValue = Iter_iterGetValue; - pc->iterGetName = Iter_iterGetName; - return; - } - - toDictFunc = PyObject_GetAttrString(obj, "toDict"); + if (is_simple_frame(obj)) { + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetName = NpyArr_iterGetName; + + pc->newObj = get_values(obj); + if (!pc->newObj) { + goto INVALID; + } + } else { + pc->iterBegin = PdBlock_iterBegin; + pc->iterEnd = PdBlock_iterEnd; + pc->iterNext = PdBlock_iterNext; + pc->iterGetName = PdBlock_iterGetName; + } + pc->iterGetValue = NpyArr_iterGetValue; + + if (enc->outputFormat == VALUES) { + PRINTMARK(); + tc->type = JT_ARRAY; + } else if (enc->outputFormat == RECORDS) { + PRINTMARK(); + tc->type = JT_ARRAY; + tmpObj = PyObject_GetAttrString(obj, "columns"); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + goto INVALID; + } + pc->columnLabelsLen = PyObject_Size(tmpObj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, + (JSONObjectEncoder *)enc, + pc->columnLabelsLen); + Py_DECREF(tmpObj); + if (!pc->columnLabels) { + goto INVALID; + } + } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + PRINTMARK(); + tc->type = JT_OBJECT; + tmpObj = (enc->outputFormat == INDEX + ? 
PyObject_GetAttrString(obj, "index") + : PyObject_GetAttrString(obj, "columns")); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + goto INVALID; + } + pc->rowLabelsLen = PyObject_Size(tmpObj); + pc->rowLabels = + NpyArr_encodeLabels((PyArrayObject *)values, + (JSONObjectEncoder *)enc, pc->rowLabelsLen); + Py_DECREF(tmpObj); + tmpObj = (enc->outputFormat == INDEX + ? PyObject_GetAttrString(obj, "columns") + : PyObject_GetAttrString(obj, "index")); + if (!tmpObj) { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + pc->columnLabelsLen = PyObject_Size(tmpObj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, + (JSONObjectEncoder *)enc, + pc->columnLabelsLen); + Py_DECREF(tmpObj); + if (!pc->columnLabels) { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + + if (enc->outputFormat == COLUMNS) { + PRINTMARK(); + pc->transpose = 1; + } + } else { + goto INVALID; + } + return; + } else if (PyDict_Check(obj)) { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); + + return; + } else if (PyList_Check(obj)) { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } else if (PyTuple_Check(obj)) { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } else if (PyAnySet_Check(obj)) { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Iter_iterBegin; + pc->iterEnd = Iter_iterEnd; + pc->iterNext = Iter_iterNext; + pc->iterGetValue = Iter_iterGetValue; + pc->iterGetName = Iter_iterGetName; + return; + } + + toDictFunc = PyObject_GetAttrString(obj, "toDict"); + + if (toDictFunc) { + PyObject *tuple = PyTuple_New(0); + PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toDictFunc); + + if (toDictResult == NULL) { + PyErr_Clear(); + tc->type = JT_NULL; + return; + } - if (toDictFunc) - { - PyObject* tuple = PyTuple_New(0); - PyObject* toDictResult = PyObject_Call(toDictFunc, tuple, NULL); - Py_DECREF(tuple); - Py_DECREF(toDictFunc); + if (!PyDict_Check(toDictResult)) { + Py_DECREF(toDictResult); + tc->type = JT_NULL; + return; + } - if (toDictResult == NULL) - { - PyErr_Clear(); - tc->type = JT_NULL; - return; + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = toDictResult; + return; } - if (!PyDict_Check(toDictResult)) - { - Py_DECREF(toDictResult); - tc->type = JT_NULL; - return; + PyErr_Clear(); + + if (enc->defaultHandler) { + Object_invokeDefaultHandler(obj, enc); + goto INVALID; } PRINTMARK(); tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; 
- pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = toDictResult; + pc->iterBegin = Dir_iterBegin; + pc->iterEnd = Dir_iterEnd; + pc->iterNext = Dir_iterNext; + pc->iterGetValue = Dir_iterGetValue; + pc->iterGetName = Dir_iterGetName; return; - } - - PyErr_Clear(); - - if (enc->defaultHandler) - { - Object_invokeDefaultHandler(obj, enc); - goto INVALID; - } - - PRINTMARK(); - tc->type = JT_OBJECT; - pc->iterBegin = Dir_iterBegin; - pc->iterEnd = Dir_iterEnd; - pc->iterNext = Dir_iterNext; - pc->iterGetValue = Dir_iterGetValue; - pc->iterGetName = Dir_iterGetName; - return; INVALID: - tc->type = JT_INVALID; - PyObject_Free(tc->prv); - tc->prv = NULL; - return; + tc->type = JT_INVALID; + PyObject_Free(tc->prv); + tc->prv = NULL; + return; } -void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) -{ - PRINTMARK(); - if(tc->prv) - { - Py_XDECREF(GET_TC(tc)->newObj); - GET_TC(tc)->newObj = NULL; - NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); - GET_TC(tc)->rowLabels = NULL; - NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); - GET_TC(tc)->columnLabels = NULL; - - PyObject_Free(GET_TC(tc)->cStr); - GET_TC(tc)->cStr = NULL; - if (tc->prv != &(((PyObjectEncoder*) tc->encoder)->basicTypeContext)) - { - PyObject_Free(tc->prv); +void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { + PRINTMARK(); + if (tc->prv) { + Py_XDECREF(GET_TC(tc)->newObj); + GET_TC(tc)->newObj = NULL; + NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + GET_TC(tc)->rowLabels = NULL; + NpyArr_freeLabels(GET_TC(tc)->columnLabels, + GET_TC(tc)->columnLabelsLen); + GET_TC(tc)->columnLabels = NULL; + + PyObject_Free(GET_TC(tc)->cStr); + GET_TC(tc)->cStr = NULL; + if (tc->prv != &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT + PyObject_Free(tc->prv); + } + tc->prv = NULL; } - tc->prv = NULL; - } } -const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) -{ - return GET_TC(tc)->PyTypeToJSON (obj, tc, NULL, _outLen); +const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen) { + return GET_TC(tc)->PyTypeToJSON(obj, tc, NULL, _outLen); } -JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) -{ - JSINT64 ret; - GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); - return ret; +JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) { + JSINT64 ret; + GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL); + return ret; } -JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) -{ - JSINT32 ret; - GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); - return ret; +JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) { + JSINT32 ret; + GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL); + return ret; } -double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) -{ - double ret; - GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); - return ret; +double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) { + double ret; + GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL); + return ret; } -static void Object_releaseObject(JSOBJ _obj) -{ - Py_DECREF( (PyObject *) _obj); -} +static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } -void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->iterBegin(obj, tc); +void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->iterBegin(obj, tc); } -int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->iterNext(obj, tc); +int Object_iterNext(JSOBJ obj, 
JSONTypeContext *tc) { + return GET_TC(tc)->iterNext(obj, tc); } -void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) -{ - GET_TC(tc)->iterEnd(obj, tc); +void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->iterEnd(obj, tc); } -JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) -{ - return GET_TC(tc)->iterGetValue(obj, tc); +JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->iterGetValue(obj, tc); } -char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) -{ - return GET_TC(tc)->iterGetName(obj, tc, outLen); +char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { + return GET_TC(tc)->iterGetName(obj, tc, outLen); } -PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) -{ - static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", "encode_html_chars", "orient", "date_unit", "iso_dates", "default_handler", NULL}; - - char buffer[65536]; - char *ret; - PyObject *newobj; - PyObject *oinput = NULL; - PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting - PyObject *oencodeHTMLChars = NULL; - char *sOrient = NULL; - char *sdateFormat = NULL; - PyObject *oisoDates = 0; - PyObject *odefHandler = 0; - - PyObjectEncoder pyEncoder = - { - { +PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = { + "obj", "ensure_ascii", "double_precision", "encode_html_chars", + "orient", "date_unit", "iso_dates", "default_handler", + NULL}; + + char buffer[65536]; + char *ret; + PyObject *newobj; + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; + int idoublePrecision = 10; // default double precision setting + PyObject *oencodeHTMLChars = NULL; + char *sOrient = NULL; + char *sdateFormat = NULL; + PyObject *oisoDates = 0; + PyObject *odefHandler = 0; + + PyObjectEncoder pyEncoder = {{ Object_beginTypeContext, Object_endTypeContext, Object_getStringValue, @@ -2695,230 +2312,188 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) PyObject_Malloc, PyObject_Realloc, PyObject_Free, - -1, //recursionMax + -1, // recursionMax idoublePrecision, - 1, //forceAscii - 0, //encodeHTMLChars - } - }; - JSONObjectEncoder* encoder = (JSONObjectEncoder*) &pyEncoder; - - pyEncoder.npyCtxtPassthru = NULL; - pyEncoder.blkCtxtPassthru = NULL; - pyEncoder.npyType = -1; - pyEncoder.npyValue = NULL; - pyEncoder.datetimeIso = 0; - pyEncoder.datetimeUnit = PANDAS_FR_ms; - pyEncoder.outputFormat = COLUMNS; - pyEncoder.defaultHandler = 0; - pyEncoder.basicTypeContext.newObj = NULL; - pyEncoder.basicTypeContext.dictObj = NULL; - pyEncoder.basicTypeContext.itemValue = NULL; - pyEncoder.basicTypeContext.itemName = NULL; - pyEncoder.basicTypeContext.attrList = NULL; - pyEncoder.basicTypeContext.iterator = NULL; - pyEncoder.basicTypeContext.cStr = NULL; - pyEncoder.basicTypeContext.npyarr = NULL; - pyEncoder.basicTypeContext.rowLabels = NULL; - pyEncoder.basicTypeContext.columnLabels = NULL; - - PRINTMARK(); - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOO", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, &oisoDates, &odefHandler)) - { - return NULL; - } - - if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) - { - encoder->forceASCII = 0; - } - - if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) - { - encoder->encodeHTMLChars = 1; - } - - if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) - { - PyErr_Format ( - 
PyExc_ValueError, - "Invalid value '%d' for option 'double_precision', max is '%u'", - idoublePrecision, - JSON_DOUBLE_MAX_DECIMALS); - return NULL; - } - encoder->doublePrecision = idoublePrecision; - - if (sOrient != NULL) - { - if (strcmp(sOrient, "records") == 0) - { - pyEncoder.outputFormat = RECORDS; - } - else - if (strcmp(sOrient, "index") == 0) - { - pyEncoder.outputFormat = INDEX; - } - else - if (strcmp(sOrient, "split") == 0) - { - pyEncoder.outputFormat = SPLIT; - } - else - if (strcmp(sOrient, "values") == 0) - { - pyEncoder.outputFormat = VALUES; - } - else - if (strcmp(sOrient, "columns") != 0) - { - PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'orient'", sOrient); - return NULL; - } - } + 1, // forceAscii + 0, // encodeHTMLChars + }}; + JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; + + pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.blkCtxtPassthru = NULL; + pyEncoder.npyType = -1; + pyEncoder.npyValue = NULL; + pyEncoder.datetimeIso = 0; + pyEncoder.datetimeUnit = PANDAS_FR_ms; + pyEncoder.outputFormat = COLUMNS; + pyEncoder.defaultHandler = 0; + pyEncoder.basicTypeContext.newObj = NULL; + pyEncoder.basicTypeContext.dictObj = NULL; + pyEncoder.basicTypeContext.itemValue = NULL; + pyEncoder.basicTypeContext.itemName = NULL; + pyEncoder.basicTypeContext.attrList = NULL; + pyEncoder.basicTypeContext.iterator = NULL; + pyEncoder.basicTypeContext.cStr = NULL; + pyEncoder.basicTypeContext.npyarr = NULL; + pyEncoder.basicTypeContext.rowLabels = NULL; + pyEncoder.basicTypeContext.columnLabels = NULL; - if (sdateFormat != NULL) - { - if (strcmp(sdateFormat, "s") == 0) - { - pyEncoder.datetimeUnit = PANDAS_FR_s; - } - else - if (strcmp(sdateFormat, "ms") == 0) - { - pyEncoder.datetimeUnit = PANDAS_FR_ms; + PRINTMARK(); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOO", kwlist, &oinput, + &oensureAscii, &idoublePrecision, + &oencodeHTMLChars, &sOrient, &sdateFormat, + &oisoDates, &odefHandler)) { + return NULL; } - else - if (strcmp(sdateFormat, "us") == 0) - { - pyEncoder.datetimeUnit = PANDAS_FR_us; + + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) { + encoder->forceASCII = 0; } - else - if (strcmp(sdateFormat, "ns") == 0) - { - pyEncoder.datetimeUnit = PANDAS_FR_ns; + + if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) { + encoder->encodeHTMLChars = 1; } - else - { - PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'date_unit'", sdateFormat); - return NULL; + + if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) { + PyErr_Format( + PyExc_ValueError, + "Invalid value '%d' for option 'double_precision', max is '%u'", + idoublePrecision, JSON_DOUBLE_MAX_DECIMALS); + return NULL; } - } + encoder->doublePrecision = idoublePrecision; - if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) - { - pyEncoder.datetimeIso = 1; - } + if (sOrient != NULL) { + if (strcmp(sOrient, "records") == 0) { + pyEncoder.outputFormat = RECORDS; + } else if (strcmp(sOrient, "index") == 0) { + pyEncoder.outputFormat = INDEX; + } else if (strcmp(sOrient, "split") == 0) { + pyEncoder.outputFormat = SPLIT; + } else if (strcmp(sOrient, "values") == 0) { + pyEncoder.outputFormat = VALUES; + } else if (strcmp(sOrient, "columns") != 0) { + PyErr_Format(PyExc_ValueError, + "Invalid value '%s' for option 'orient'", sOrient); + return NULL; + } + } + if (sdateFormat != NULL) { + if (strcmp(sdateFormat, "s") == 0) { + pyEncoder.datetimeUnit = PANDAS_FR_s; + } else if (strcmp(sdateFormat, "ms") == 0) { + 
pyEncoder.datetimeUnit = PANDAS_FR_ms; + } else if (strcmp(sdateFormat, "us") == 0) { + pyEncoder.datetimeUnit = PANDAS_FR_us; + } else if (strcmp(sdateFormat, "ns") == 0) { + pyEncoder.datetimeUnit = PANDAS_FR_ns; + } else { + PyErr_Format(PyExc_ValueError, + "Invalid value '%s' for option 'date_unit'", + sdateFormat); + return NULL; + } + } - if (odefHandler != NULL && odefHandler != Py_None) - { - if (!PyCallable_Check(odefHandler)) - { - PyErr_SetString (PyExc_TypeError, "Default handler is not callable"); - return NULL; + if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) { + pyEncoder.datetimeIso = 1; } - pyEncoder.defaultHandler = odefHandler; - } - pyEncoder.originalOutputFormat = pyEncoder.outputFormat; - PRINTMARK(); - ret = JSON_EncodeObject (oinput, encoder, buffer, sizeof (buffer)); - PRINTMARK(); + if (odefHandler != NULL && odefHandler != Py_None) { + if (!PyCallable_Check(odefHandler)) { + PyErr_SetString(PyExc_TypeError, "Default handler is not callable"); + return NULL; + } + pyEncoder.defaultHandler = odefHandler; + } - if (PyErr_Occurred()) - { + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; PRINTMARK(); - return NULL; - } - - if (encoder->errorMsg) - { + ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); PRINTMARK(); - if (ret != buffer) - { - encoder->free (ret); + + if (PyErr_Occurred()) { + PRINTMARK(); + return NULL; } - PyErr_Format (PyExc_OverflowError, "%s", encoder->errorMsg); - return NULL; - } + if (encoder->errorMsg) { + PRINTMARK(); + if (ret != buffer) { + encoder->free(ret); + } + + PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); + return NULL; + } - newobj = PyString_FromString (ret); + newobj = PyString_FromString(ret); - if (ret != buffer) - { - encoder->free (ret); - } + if (ret != buffer) { + encoder->free(ret); + } - PRINTMARK(); + PRINTMARK(); - return newobj; + return newobj; } -PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs) -{ - PyObject *data; - PyObject *file; - PyObject *string; - PyObject *write; - PyObject *argtuple; +PyObject *objToJSONFile(PyObject *self, PyObject *args, PyObject *kwargs) { + PyObject *data; + PyObject *file; + PyObject *string; + PyObject *write; + PyObject *argtuple; - PRINTMARK(); + PRINTMARK(); - if (!PyArg_ParseTuple (args, "OO", &data, &file)) - { - return NULL; - } + if (!PyArg_ParseTuple(args, "OO", &data, &file)) { + return NULL; + } - if (!PyObject_HasAttrString (file, "write")) - { - PyErr_Format (PyExc_TypeError, "expected file"); - return NULL; - } + if (!PyObject_HasAttrString(file, "write")) { + PyErr_Format(PyExc_TypeError, "expected file"); + return NULL; + } - write = PyObject_GetAttrString (file, "write"); + write = PyObject_GetAttrString(file, "write"); - if (!PyCallable_Check (write)) - { - Py_XDECREF(write); - PyErr_Format (PyExc_TypeError, "expected file"); - return NULL; - } + if (!PyCallable_Check(write)) { + Py_XDECREF(write); + PyErr_Format(PyExc_TypeError, "expected file"); + return NULL; + } - argtuple = PyTuple_Pack(1, data); + argtuple = PyTuple_Pack(1, data); - string = objToJSON (self, argtuple, kwargs); + string = objToJSON(self, argtuple, kwargs); + + if (string == NULL) { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } - if (string == NULL) - { - Py_XDECREF(write); Py_XDECREF(argtuple); - return NULL; - } - Py_XDECREF(argtuple); + argtuple = PyTuple_Pack(1, string); + if (argtuple == NULL) { + Py_XDECREF(write); + return NULL; + } + if (PyObject_CallObject(write, argtuple) == NULL) { + Py_XDECREF(write); + 
Py_XDECREF(argtuple); + return NULL; + } - argtuple = PyTuple_Pack (1, string); - if (argtuple == NULL) - { - Py_XDECREF(write); - return NULL; - } - if (PyObject_CallObject (write, argtuple) == NULL) - { Py_XDECREF(write); - Py_XDECREF(argtuple); - return NULL; - } + Py_DECREF(argtuple); + Py_XDECREF(string); - Py_XDECREF(write); - Py_DECREF(argtuple); - Py_XDECREF(string); - - PRINTMARK(); + PRINTMARK(); - Py_RETURN_NONE; + Py_RETURN_NONE; } diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/src/ujson/python/py_defines.h index 723eaed336f6b..b32285766c86a 100644 --- a/pandas/src/ujson/python/py_defines.h +++ b/pandas/src/ujson/python/py_defines.h @@ -35,6 +35,9 @@ Numeric decoder derived from from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ +#ifndef PANDAS_SRC_UJSON_PYTHON_PY_DEFINES_H_ +#define PANDAS_SRC_UJSON_PYTHON_PY_DEFINES_H_ + #include #if PY_MAJOR_VERSION >= 3 @@ -51,3 +54,5 @@ Numeric decoder derived from from TCL library #define PyString_FromString PyUnicode_FromString #endif + +#endif // PANDAS_SRC_UJSON_PYTHON_PY_DEFINES_H_ diff --git a/pandas/src/ujson/python/ujson.c b/pandas/src/ujson/python/ujson.c index 48ea92ed3bc8c..8c25975f12409 100644 --- a/pandas/src/ujson/python/ujson.c +++ b/pandas/src/ujson/python/ujson.c @@ -39,74 +39,84 @@ Numeric decoder derived from from TCL library #include "version.h" /* objToJSON */ -PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs); +PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs); void initObjToJSON(void); /* JSONToObj */ -PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs); +PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs); /* objToJSONFile */ -PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs); +PyObject *objToJSONFile(PyObject *self, PyObject *args, PyObject *kwargs); /* JSONFileToObj */ -PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs); +PyObject *JSONFileToObj(PyObject *self, PyObject *args, PyObject *kwargs); - -#define ENCODER_HELP_TEXT "Use ensure_ascii=false to output UTF-8. Pass in double_precision to alter the maximum digit precision of doubles. Set encode_html_chars=True to encode < > & as unicode escape sequences." +#define ENCODER_HELP_TEXT \ + "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \ + "alter the maximum digit precision of doubles. Set " \ + "encode_html_chars=True to encode < > & as unicode escape sequences." static PyMethodDef ujsonMethods[] = { - {"encode", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. " ENCODER_HELP_TEXT}, - {"decode", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure. Use precise_float=True to use high precision float decoder."}, - {"dumps", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. " ENCODER_HELP_TEXT}, - {"loads", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure. Use precise_float=True to use high precision float decoder."}, - {"dump", (PyCFunction) objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. " ENCODER_HELP_TEXT}, - {"load", (PyCFunction) JSONFileToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as file to dict object structure. 
Use precise_float=True to use high precision float decoder."}, - {NULL, NULL, 0, NULL} /* Sentinel */ + {"encode", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS, + "Converts arbitrary object recursivly into JSON. " ENCODER_HELP_TEXT}, + {"decode", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS, + "Converts JSON as string to dict object structure. Use precise_float=True " + "to use high precision float decoder."}, + {"dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS, + "Converts arbitrary object recursivly into JSON. " ENCODER_HELP_TEXT}, + {"loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS, + "Converts JSON as string to dict object structure. Use precise_float=True " + "to use high precision float decoder."}, + {"dump", (PyCFunction)objToJSONFile, METH_VARARGS | METH_KEYWORDS, + "Converts arbitrary object recursively into JSON " + "file. " ENCODER_HELP_TEXT}, + {"load", (PyCFunction)JSONFileToObj, METH_VARARGS | METH_KEYWORDS, + "Converts JSON as file to dict object structure. Use precise_float=True " + "to use high precision float decoder."}, + {NULL, NULL, 0, NULL} /* Sentinel */ }; #if PY_MAJOR_VERSION >= 3 static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "_pandasujson", - 0, /* m_doc */ - -1, /* m_size */ - ujsonMethods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL /* m_free */ + PyModuleDef_HEAD_INIT, + "_pandasujson", + 0, /* m_doc */ + -1, /* m_size */ + ujsonMethods, /* m_methods */ + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ }; -#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void) -#define PYMODULE_CREATE() PyModule_Create(&moduledef) -#define MODINITERROR return NULL +#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void) +#define PYMODULE_CREATE() PyModule_Create(&moduledef) +#define MODINITERROR return NULL #else -#define PYMODINITFUNC PyMODINIT_FUNC initjson(void) -#define PYMODULE_CREATE() Py_InitModule("json", ujsonMethods) -#define MODINITERROR return +#define PYMODINITFUNC PyMODINIT_FUNC initjson(void) +#define PYMODULE_CREATE() Py_InitModule("json", ujsonMethods) +#define MODINITERROR return #endif -PYMODINITFUNC -{ - PyObject *module; - PyObject *version_string; +PYMODINITFUNC { + PyObject *module; + PyObject *version_string; - initObjToJSON(); - module = PYMODULE_CREATE(); + initObjToJSON(); + module = PYMODULE_CREATE(); - if (module == NULL) - { - MODINITERROR; - } + if (module == NULL) { + MODINITERROR; + } - version_string = PyString_FromString (UJSON_VERSION); - PyModule_AddObject (module, "__version__", version_string); + version_string = PyString_FromString(UJSON_VERSION); + PyModule_AddObject(module, "__version__", version_string); #if PY_MAJOR_VERSION >= 3 - return module; + return module; #endif } diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h index 2d4fd137edefe..c074ef572101d 100644 --- a/pandas/src/ujson/python/version.h +++ b/pandas/src/ujson/python/version.h @@ -35,4 +35,9 @@ Numeric decoder derived from from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. 
*/ +#ifndef PANDAS_SRC_UJSON_PYTHON_VERSION_H_ +#define PANDAS_SRC_UJSON_PYTHON_VERSION_H_ + #define UJSON_VERSION "1.33" + +#endif // PANDAS_SRC_UJSON_PYTHON_VERSION_H_ From 33e11ade7d1c5753ba3538e539e0917db73b257e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 15 Dec 2016 06:24:26 -0500 Subject: [PATCH 178/183] API: Return sparse objects always for cumsum Always return `SparseArray` and `SparseSeries` for `SparseArray.cumsum()` and `SparseSeries.cumsum()` respectively, regardless of `fill_value`. Closes #12855. Author: gfyoung Closes #14771 from gfyoung/sparse-return-type and squashes the following commits: 83314fc [gfyoung] API: Return sparse objects always for cumsum --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/sparse/array.py | 23 ++++++--- pandas/sparse/series.py | 28 +++++++---- pandas/sparse/tests/test_array.py | 75 +++++++++++++++++------------- pandas/sparse/tests/test_series.py | 9 ++-- 5 files changed, 84 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6ee97c555f5ed..1d226a2a9a5ca 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -90,6 +90,7 @@ Backwards incompatible API changes - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) +- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 4bb36446c9ff7..da13726e88a14 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -620,19 +620,30 @@ def sum(self, axis=0, *args, **kwargs): def cumsum(self, axis=0, *args, **kwargs): """ - Cumulative sum of values. Preserves locations of NaN values + Cumulative sum of non-NA/null values. + + When performing the cumulative summation, any non-NA/null values will + be skipped. The resulting SparseArray will preserve the locations of + NaN values, but the fill value will be `np.nan` regardless. + + Parameters + ---------- + axis : int or None + Axis over which to perform the cumulative summation. If None, + perform cumulative summation over flattened array. Returns ------- - cumsum : Series + cumsum : SparseArray """ nv.validate_cumsum(args, kwargs) - # TODO: gh-12855 - return a SparseArray here - if notnull(self.fill_value): - return self.to_dense().cumsum() + if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour. + raise ValueError("axis(={axis}) out of bounds".format(axis=axis)) + + if not self._null_fill_value: + return SparseArray(self.to_dense()).cumsum() - # TODO: what if sp_values contains NaN?? return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index, fill_value=self.fill_value) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 660f76ff1001d..d6bc892921c42 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -630,21 +630,29 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs): def cumsum(self, axis=0, *args, **kwargs): """ - Cumulative sum of values. Preserves locations of NaN values + Cumulative sum of non-NA/null values. + + When performing the cumulative summation, any non-NA/null values will + be skipped. The resulting SparseSeries will preserve the locations of + NaN values, but the fill value will be `np.nan` regardless. 
+ + Parameters + ---------- + axis : {0} Returns ------- - cumsum : SparseSeries if `self` has a null `fill_value` and a - generic Series otherwise + cumsum : SparseSeries """ nv.validate_cumsum(args, kwargs) - new_array = SparseArray.cumsum(self.values) - if isinstance(new_array, SparseArray): - return self._constructor( - new_array, index=self.index, - sparse_index=new_array.sp_index).__finalize__(self) - # TODO: gh-12855 - return a SparseSeries here - return Series(new_array, index=self.index).__finalize__(self) + if axis is not None: + axis = self._get_axis_number(axis) + + new_array = self.values.cumsum() + + return self._constructor( + new_array, index=self.index, + sparse_index=new_array.sp_index).__finalize__(self) @Appender(generic._shared_docs['isnull']) def isnull(self): diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index f210f70ad1940..bd896ae5b86d9 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -688,46 +688,57 @@ def test_numpy_sum(self): SparseArray(data), out=out) def test_cumsum(self): - data = np.arange(10).astype(float) - out = SparseArray(data).cumsum() - expected = SparseArray(data.cumsum()) - tm.assert_sp_array_equal(out, expected) + non_null_data = np.array([1, 2, 3, 4, 5], dtype=float) + non_null_expected = SparseArray(non_null_data.cumsum()) - # TODO: gh-12855 - return a SparseArray here - data[5] = np.nan - out = SparseArray(data, fill_value=2).cumsum() - self.assertNotIsInstance(out, SparseArray) - tm.assert_numpy_array_equal(out, data.cumsum()) + null_data = np.array([1, 2, np.nan, 4, 5], dtype=float) + null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])) + + for data, expected in [ + (null_data, null_expected), + (non_null_data, non_null_expected) + ]: + out = SparseArray(data).cumsum() + tm.assert_sp_array_equal(out, expected) + + out = SparseArray(data, fill_value=np.nan).cumsum() + tm.assert_sp_array_equal(out, expected) - out = SparseArray(data, fill_value=np.nan).cumsum() - expected = SparseArray(np.array([ - 0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40])) - tm.assert_sp_array_equal(out, expected) + out = SparseArray(data, fill_value=2).cumsum() + tm.assert_sp_array_equal(out, expected) + + axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid. 
+ msg = "axis\(={axis}\) out of bounds".format(axis=axis) + with tm.assertRaisesRegexp(ValueError, msg): + SparseArray(data).cumsum(axis=axis) def test_numpy_cumsum(self): - data = np.arange(10).astype(float) - out = np.cumsum(SparseArray(data)) - expected = SparseArray(data.cumsum()) - tm.assert_sp_array_equal(out, expected) + non_null_data = np.array([1, 2, 3, 4, 5], dtype=float) + non_null_expected = SparseArray(non_null_data.cumsum()) - # TODO: gh-12855 - return a SparseArray here - data[5] = np.nan - out = np.cumsum(SparseArray(data, fill_value=2)) - self.assertNotIsInstance(out, SparseArray) - tm.assert_numpy_array_equal(out, data.cumsum()) + null_data = np.array([1, 2, np.nan, 4, 5], dtype=float) + null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])) - out = np.cumsum(SparseArray(data, fill_value=np.nan)) - expected = SparseArray(np.array([ - 0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40])) - tm.assert_sp_array_equal(out, expected) + for data, expected in [ + (null_data, null_expected), + (non_null_data, non_null_expected) + ]: + out = np.cumsum(SparseArray(data)) + tm.assert_sp_array_equal(out, expected) - msg = "the 'dtype' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.cumsum, - SparseArray(data), dtype=np.int64) + out = np.cumsum(SparseArray(data, fill_value=np.nan)) + tm.assert_sp_array_equal(out, expected) - msg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.cumsum, - SparseArray(data), out=out) + out = np.cumsum(SparseArray(data, fill_value=2)) + tm.assert_sp_array_equal(out, expected) + + msg = "the 'dtype' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.cumsum, + SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.cumsum, + SparseArray(data), out=out) def test_mean(self): data = np.arange(10).astype(float) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 116596e36b402..14339ab388a5d 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -1331,21 +1331,22 @@ def test_cumsum(self): expected = SparseSeries(self.bseries.to_dense().cumsum()) tm.assert_sp_series_equal(result, expected) - # TODO: gh-12855 - return a SparseSeries here result = self.zbseries.cumsum() expected = self.zbseries.to_dense().cumsum() - self.assertNotIsInstance(result, SparseSeries) tm.assert_series_equal(result, expected) + axis = 1 # Series is 1-D, so only axis = 0 is valid. 
+ msg = "No axis named {axis}".format(axis=axis) + with tm.assertRaisesRegexp(ValueError, msg): + self.bseries.cumsum(axis=axis) + def test_numpy_cumsum(self): result = np.cumsum(self.bseries) expected = SparseSeries(self.bseries.to_dense().cumsum()) tm.assert_sp_series_equal(result, expected) - # TODO: gh-12855 - return a SparseSeries here result = np.cumsum(self.zbseries) expected = self.zbseries.to_dense().cumsum() - self.assertNotIsInstance(result, SparseSeries) tm.assert_series_equal(result, expected) msg = "the 'dtype' parameter is not supported" From 9ab404648e005222ea1af66fb9911c3aca96e6ab Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Dec 2016 06:48:35 -0500 Subject: [PATCH 179/183] DOC: doc changes in whatsnew 0.20.0 --- doc/source/whatsnew/v0.20.0.txt | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 1d226a2a9a5ca..508093380ac81 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -23,11 +23,10 @@ New features ~~~~~~~~~~~~ -``dtype`` keyword for data io +``dtype`` keyword for data IO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns - is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs ` for more information. +The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs ` for more information. .. ipython:: python @@ -44,14 +43,12 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files. pd.read_fwf(StringIO(data)).dtypes pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes -.. _whatsnew_0200.enhancements.other: +.. _whatsnew_0200.enhancements.groupby_access: -Other enhancements -^^^^^^^^^^^^^^^^^^ -- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) +Groupby Enhancements +^^^^^^^^^^^^^^^^^^^^ -- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) -- Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`) +Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`) .. ipython:: python @@ -63,17 +60,25 @@ Other enhancements df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3], 'B': np.arange(8)}, index=index) + df df.groupby(['second', 'A']).sum() +.. _whatsnew_0200.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ +- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) + +- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack of sorting or an incorrect key. 
See :ref:`here ` -- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`) +- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) @@ -98,8 +103,6 @@ Backwards incompatible API changes Other API Changes ^^^^^^^^^^^^^^^^^ -- Change error message text when indexing via a - boolean ``Series`` that has an incompatible index (:issue:`14491`) .. _whatsnew_0200.deprecations: From 49e3137a0dbb4458e98d3e94d76b80ec8a390793 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Dec 2016 06:50:30 -0500 Subject: [PATCH 180/183] DOC: whatsnew 0.19.2 --- doc/source/whatsnew/v0.19.2.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index dabc6036fc9ba..2f34401d88e2a 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -25,8 +25,8 @@ Performance Improvements .. _whatsnew_0192.enhancements.other: -Other enhancements -^^^^^^^^^^^^^^^^^^ +Other Enhancements +~~~~~~~~~~~~~~~~~~ - ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`) From 033d34596f5327472007f0f15029b86050ee2592 Mon Sep 17 00:00:00 2001 From: Keshav Ramaswamy Date: Thu, 15 Dec 2016 10:40:38 -0500 Subject: [PATCH 181/183] Fixed KDE Plot to drop the missing values (#14820) BUG: Fixed KDE plot to ignore missing values closes #14821 * fixed kde plot to ignore the missing values * added comment to elaborate the changes made * added a release note in whatsnew/0.19.2 * added test to check for missing values and cleaned up whatsnew doc * added comment to refer the issue * modified to fit lint checks * replaced ._xorig with .get_xdata() --- doc/source/whatsnew/v0.19.2.txt | 2 ++ pandas/tests/plotting/test_series.py | 6 +++++- pandas/tools/plotting.py | 7 ++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 2f34401d88e2a..82d43db667550 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -88,4 +88,6 @@ Bug Fixes - Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) +- Bug in ``.plot(kind='kde')`` which did not drop missing values to generate the KDE Plot, instead generating an empty plot. 
(:issue:`14821`) + - Bug in ``unstack()`` if called with a list of column(s) as an argument, regardless of the dtypes of all columns, they get coerced to ``object`` (:issue:`11847`) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index e752197c6ad77..73119fec88198 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -569,7 +569,11 @@ def test_kde_missing_vals(self): _skip_if_no_scipy_gaussian_kde() s = Series(np.random.uniform(size=50)) s[0] = np.nan - _check_plot_works(s.plot.kde) + axes = _check_plot_works(s.plot.kde) + # check if the values have any missing values + # GH14821 + self.assertTrue(any(~np.isnan(axes.lines[0].get_xdata())), + msg='Missing Values not dropped') @slow def test_hist_kwargs(self): diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 21e8b64a3656a..bd9933b12b580 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2153,9 +2153,10 @@ def _args_adjust(self): def _get_ind(self, y): if self.ind is None: - sample_range = max(y) - min(y) - ind = np.linspace(min(y) - 0.5 * sample_range, - max(y) + 0.5 * sample_range, 1000) + # np.nanmax() and np.nanmin() ignores the missing values + sample_range = np.nanmax(y) - np.nanmin(y) + ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, 1000) else: ind = self.ind return ind From 3ba2cff9c55cd16b172f9feb09da551990753f3b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 15 Dec 2016 18:08:15 -0500 Subject: [PATCH 182/183] PERF: use StringHasTable for strings xref #13745 provides a modest speedup for all string hashing. The key thing is, it will release the GIL on more operations where this is possible (mainly factorize). can be easily extended to value_counts() and .duplicated() (for strings) Author: Jeff Reback Closes #14859 from jreback/string and squashes the following commits: 98f46c2 [Jeff Reback] PERF: use StringHashTable for strings in factorizing --- asv_bench/benchmarks/algorithms.py | 5 + asv_bench/benchmarks/gil.py | 35 +++ doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/algorithms.py | 33 ++- pandas/hashtable.pxd | 8 +- pandas/hashtable.pyx | 8 +- pandas/src/hashtable_class_helper.pxi.in | 305 ++++++++++++++++++----- 7 files changed, 330 insertions(+), 66 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index c4a6117c0704a..20d149493951f 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -8,6 +8,7 @@ class Algorithms(object): def setup(self): N = 100000 + np.random.seed(1234) self.int_unique = pd.Int64Index(np.arange(N * 5)) # cache is_unique @@ -23,11 +24,15 @@ def setup(self): self.arrpos = np.arange(1000000) self.arrneg = np.arange(-1000000, 0) self.arrmixed = np.array([1, -1]).repeat(500000) + self.strings = tm.makeStringIndex(100000) # match self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) + def time_factorize_string(self): + self.strings.factorize() + def time_factorize_int(self): self.int.factorize() diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 3f53894364cd2..1c5e59672cb57 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -379,3 +379,38 @@ def pg_read_csv_datetime(self): def time_read_csv_datetime(self): self.pg_read_csv_datetime() + + +class nogil_factorize(object): + number = 1 + repeat = 5 + + def setup(self): + if (not have_real_test_parallel): + raise 
NotImplementedError + + np.random.seed(1234) + self.strings = tm.makeStringIndex(100000) + + def factorize_strings(self): + pd.factorize(self.strings) + + @test_parallel(num_threads=4) + def _pg_factorize_strings_4(self): + self.factorize_strings() + + def time_factorize_strings_4(self): + for i in range(2): + self._pg_factorize_strings_4() + + @test_parallel(num_threads=2) + def _pg_factorize_strings_2(self): + self.factorize_strings() + + def time_factorize_strings_2(self): + for i in range(4): + self._pg_factorize_strings_2() + + def time_factorize_strings(self): + for i in range(8): + self.factorize_strings() diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 508093380ac81..2855cde95ac2a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -135,7 +135,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`) - +- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index effca6398419e..0d4d4143e6b9b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1): values = np.array(values, dtype='O') f = lambda htype, caster: _match_generic(to_match, values, htype, caster) - result = _hashtable_algo(f, values.dtype, np.int64) + result = _hashtable_algo(f, values, np.int64) if na_sentinel != -1: @@ -102,7 +102,7 @@ def unique(values): values = com._asarray_tuplesafe(values) f = lambda htype, caster: _unique_generic(values, htype, caster) - return _hashtable_algo(f, values.dtype) + return _hashtable_algo(f, values) def _unique_generic(values, table_type, type_caster): @@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr): # helpers # # ------- # -def _hashtable_algo(f, dtype, return_dtype=None): +def _hashtable_algo(f, values, return_dtype=None): """ f(HashTable, type_caster) -> result """ + + dtype = values.dtype if is_float_dtype(dtype): return f(htable.Float64HashTable, _ensure_float64) elif is_integer_dtype(dtype): @@ -773,17 +775,25 @@ def _hashtable_algo(f, dtype, return_dtype=None): elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) - else: - return f(htable.PyObjectHashTable, _ensure_object) + + # its cheaper to use a String Hash Table than Object + if lib.infer_dtype(values) in ['string']: + return f(htable.StringHashTable, _ensure_object) + + # use Object + return f(htable.PyObjectHashTable, _ensure_object) _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), 'int64': (htable.Int64HashTable, htable.Int64Vector), + 'string': (htable.StringHashTable, htable.ObjectVector), 'generic': (htable.PyObjectHashTable, htable.ObjectVector) } def _get_data_algo(values, func_map): + + f = None if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -796,8 +806,19 @@ def _get_data_algo(values, func_map): f = func_map['int64'] values = _ensure_int64(values) else: - f = func_map['generic'] + values = _ensure_object(values) + + # its cheaper to use a String Hash Table than Object + if lib.infer_dtype(values) in ['string']: + try: + f = func_map['string'] + except KeyError: + pass + + if f is None: + f = func_map['generic'] + return f, values diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd 
index 97b6687d061e9..f3ea7ad792160 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -1,4 +1,4 @@ -from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t +from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t # prototypes for sharing @@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + cpdef get_item(self, object val) + cpdef set_item(self, object key, Py_ssize_t val) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 3bda3f49cb054..ce760b49fabc0 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check from khash cimport * from numpy cimport * -from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free + +from libc.stdlib cimport malloc, free +from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free, + PyString_Check, PyBytes_Check, + PyUnicode_Check) from util cimport _checknan cimport util @@ -33,7 +37,7 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) -cdef size_t _INIT_VEC_CAP = 32 +cdef size_t _INIT_VEC_CAP = 128 include "hashtable_class_helper.pxi" diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 14e5363eee20c..22714e6305677 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -10,23 +10,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype -dtypes = [('Float64', 'float64'), ('Int64', 'int64')] - +# name, dtype, arg +# the generated StringVector is not actually used +# but is included for completeness (rather ObjectVector is used +# for uniques in hashtables) + +dtypes = [('Float64', 'float64', 'float64_t'), + ('Int64', 'int64', 'int64_t'), + ('String', 'string', 'char *')] }} -{{for name, dtype in dtypes}} +{{for name, dtype, arg in dtypes}} ctypedef struct {{name}}VectorData: - {{dtype}}_t *data + {{arg}} *data size_t n, m @cython.wraparound(False) @cython.boundscheck(False) -cdef void append_data_{{dtype}}({{name}}VectorData *data, - {{dtype}}_t x) nogil: +cdef inline void append_data_{{dtype}}({{name}}VectorData *data, + {{arg}} x) nogil: data.data[data.n] = x data.n += 1 @@ -36,8 +41,9 @@ cdef void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData Float64VectorData + StringVectorData -cdef bint needs_resize(vector_data *data) nogil: +cdef inline bint needs_resize(vector_data *data) nogil: return data.n == data.m #---------------------------------------------------------------------- @@ -46,12 +52,13 @@ cdef bint needs_resize(vector_data *data) nogil: {{py: -# name, dtype -dtypes = [('Float64', 'float64'), ('Int64', 'int64')] +# name, dtype, arg, idtype +dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'), + ('Int64', 'int64', 'int64_t', 'np.int64')] }} -{{for name, dtype in dtypes}} +{{for name, dtype, arg, idtype in dtypes}} cdef class {{name}}Vector: @@ -66,13 +73,13 @@ cdef class {{name}}Vector: raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) - self.data.data = <{{dtype}}_t*> self.ao.data + self.ao = np.empty(self.data.m, dtype={{idtype}}) + self.data.data = <{{arg}}*> self.ao.data cdef resize(self): self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) 
self.ao.resize(self.data.m) - self.data.data = <{{dtype}}_t*> self.ao.data + self.data.data = <{{arg}}*> self.ao.data def __dealloc__(self): PyMem_Free(self.data) @@ -85,7 +92,7 @@ cdef class {{name}}Vector: self.data.m = self.data.n return self.ao - cdef inline void append(self, {{dtype}}_t x): + cdef inline void append(self, {{arg}} x): if needs_resize(self.data): self.resize() @@ -94,6 +101,61 @@ cdef class {{name}}Vector: {{endfor}} +cdef class StringVector: + + cdef: + StringVectorData *data + + def __cinit__(self): + self.data = PyMem_Malloc( + sizeof(StringVectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.data.data = malloc(self.data.m * sizeof(char *)) + + cdef resize(self): + cdef: + char **orig_data + size_t i, m + + m = self.data.m + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + + # TODO: can resize? + orig_data = self.data.data + self.data.data = malloc(self.data.m * sizeof(char *)) + for i in range(m): + self.data.data[i] = orig_data[i] + + def __dealloc__(self): + free(self.data.data) + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + cdef: + ndarray ao + size_t n + object val + + ao = np.empty(self.data.n, dtype=np.object) + for i in range(self.data.n): + val = self.data.data[i] + ao[i] = val + self.data.m = self.data.n + return ao + + cdef inline void append(self, char * x): + + if needs_resize(self.data): + self.resize() + + append_data_string(self.data, x) + cdef class ObjectVector: @@ -377,9 +439,11 @@ cdef class {{name}}HashTable(HashTable): cdef class StringHashTable(HashTable): - cdef kh_str_t *table + # these by-definition *must* be strings + # or a sentinel np.nan / None missing value + na_string_sentinel = '__nan__' - def __cinit__(self, int size_hint=1): + def __init__(self, int size_hint=1): self.table = kh_init_str() if size_hint is not None: kh_resize_str(self.table, size_hint) @@ -388,17 +452,26 @@ cdef class StringHashTable(HashTable): kh_destroy_str(self.table) cpdef get_item(self, object val): - cdef khiter_t k - k = kh_get_str(self.table, util.get_c_string(val)) + cdef: + khiter_t k + char *v + v = util.get_c_string(val) + + k = kh_get_str(self.table, v) if k != self.table.n_buckets: return self.table.vals[k] else: raise KeyError(val) def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val + cdef: + Py_ssize_t i, val + char *v + + v = util.get_c_string(key) + for i in range(iterations): - k = kh_get_str(self.table, util.get_c_string(key)) + k = kh_get_str(self.table, v) if k != self.table.n_buckets: val = self.table.vals[k] @@ -406,83 +479,203 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k int ret = 0 - char* buf + char *v - buf = util.get_c_string(key) + v = util.get_c_string(val) - k = kh_put_str(self.table, buf, &ret) + k = kh_put_str(self.table, v, &ret) self.table.keys[k] = key if kh_exist_str(self.table, k): self.table.vals[k] = val else: raise KeyError(key) + @cython.boundscheck(False) def get_indexer(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - char *buf int64_t *resbuf = labels.data khiter_t k kh_str_t *table = self.table + char *v + char **vecs + vecs = malloc(n * sizeof(char *)) for i in range(n): - buf = util.get_c_string(values[i]) - k = kh_get_str(table, buf) - if k != table.n_buckets: - resbuf[i] = table.vals[k] - else: - resbuf[i] = -1 + val = values[i] + v = util.get_c_string(val) + vecs[i] = v + + with nogil: + for 
i in range(n): + k = kh_get_str(table, vecs[i]) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + + free(vecs) return labels + @cython.boundscheck(False) def unique(self, ndarray[object] values): cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, count, n = len(values) + int64_t[:] uindexer int ret = 0 object val - char *buf + ObjectVector uniques khiter_t k - ObjectVector uniques = ObjectVector() + char *v + char **vecs + vecs = malloc(n * sizeof(char *)) + uindexer = np.empty(n, dtype=np.int64) for i in range(n): val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k == self.table.n_buckets: - kh_put_str(self.table, buf, &ret) - uniques.append(val) + v = util.get_c_string(val) + vecs[i] = v + + count = 0 + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if k == self.table.n_buckets: + kh_put_str(self.table, v, &ret) + uindexer[count] = i + count += 1 + free(vecs) + # uniques + uniques = ObjectVector() + for i in range(count): + uniques.append(values[uindexer[i]]) return uniques.to_array() def factorize(self, ndarray[object] values): + uniques = ObjectVector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def lookup(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - dict reverse = {} - Py_ssize_t idx, count = 0 int ret = 0 object val - char *buf + char *v khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + # these by-definition *must* be strings + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx + + if PyUnicode_Check(val) or PyString_Check(val): + v = util.get_c_string(val) else: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) + v = util.get_c_string(self.na_string_sentinel) + vecs[i] = v - self.table.vals[k] = count - reverse[count] = val - labels[i] = count - count += 1 + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 - return reverse, labels + free(vecs) + return np.asarray(locs) + @cython.boundscheck(False) + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + char *v + char **vecs + khiter_t k + + # these by-definition *must* be strings + vecs = malloc(n * sizeof(char *)) + for i in range(n): + val = values[i] + + if PyUnicode_Check(val) or PyString_Check(val): + v = util.get_c_string(val) + else: + v = util.get_c_string(self.na_string_sentinel) + vecs[i] = v + + with nogil: + for i in range(n): + v = vecs[i] + k = kh_put_str(self.table, v, &ret) + self.table.vals[k] = i + free(vecs) + + @cython.boundscheck(False) + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=1): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + int64_t[:] uindexer + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + char *v + char **vecs + khiter_t k + + # these by-definition *must* be strings + labels = np.zeros(n, dtype=np.int64) + uindexer = np.empty(n, dtype=np.int64) + + # pre-filter out missing + # and assign pointers + vecs = malloc(n * sizeof(char *)) + for i in 
range(n): + val = values[i] + + if PyUnicode_Check(val) or PyString_Check(val): + v = util.get_c_string(val) + vecs[i] = v + else: + labels[i] = na_sentinel + + # compute + with nogil: + for i in range(n): + if labels[i] == na_sentinel: + continue + + v = vecs[i] + k = kh_get_str(self.table, v) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, v, &ret) + self.table.vals[k] = count + uindexer[count] = i + labels[i] = count + count += 1 + + free(vecs) + + # uniques + for i in range(count): + uniques.append(values[uindexer[i]]) + + return np.asarray(labels) na_sentinel = object @@ -639,4 +832,4 @@ cdef class PyObjectHashTable(HashTable): labels[i] = count count += 1 - return np.asarray(labels) \ No newline at end of file + return np.asarray(labels) From e62eeac15168bcbd81e423604b882cc0bf225966 Mon Sep 17 00:00:00 2001 From: Ashish Date: Sat, 17 Dec 2016 12:21:32 +0400 Subject: [PATCH 183/183] adding test and moving unique bins code. --- pandas/tools/tests/test_tile.py | 7 +++++++ pandas/tools/tile.py | 22 ++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 33d2a01b1256e..720edb2e3f529 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -272,6 +272,13 @@ def test_series_retbins(self): np.array([0, 0, 1, 1], dtype=np.int8)) tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + def test_qcut_duplicates_drop(self): + # GH 7751 + values = [0, 0, 0, 0, 1, 2, 3] + cats = qcut(values, 3, duplicates='drop') + ex_levels = ['[0, 1]', '(1, 3]'] + self.assertTrue((cats.categories == ex_levels).all()) + def test_single_bin(self): # issue 14652 expected = Series([0, 0]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 9415129947161..ef71d1b47b158 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -120,7 +120,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, def qcut(x, q, labels=None, retbins=False, precision=3, - duplicate_edges='raise'): + duplicates='raise'): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. 
For example @@ -177,12 +177,12 @@ def qcut(x, q, labels=None, retbins=False, precision=3, bins = algos.quantile(x, quantiles) return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, precision=precision, include_lowest=True, - duplicate_edges='raise') + duplicates=duplicates) def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, precision=3, name=None, include_lowest=False, - duplicate_edges='raise'): + duplicates='raise'): x_is_series = isinstance(x, Series) series_index = None @@ -193,16 +193,22 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, x = np.asarray(x) - side = 'left' if right else 'right' - ids = bins.searchsorted(x, side=side) + if duplicates not in ['raise', 'drop']: + raise ValueError("invalid value for 'duplicates' parameter, " + + "valid options are: raise, drop") if len(algos.unique(bins)) < len(bins): - if (duplicate_edges == 'raise'): - raise ValueError('Bin edges must be unique: %s' - % repr(bins)) + if duplicates == 'raise': + raise ValueError('Bin edges must be unique: %s'% repr(bins) + + ' You can drop duplicate edges ' + + 'by setting \'duplicates\' param' + ) else: bins = algos.unique(bins) + side = 'left' if right else 'right' + ids = bins.searchsorted(x, side=side) + if include_lowest: ids[x == bins[0]] = 1