From 8adc68f2c7f0692776b4438162fe311d0db0fbe3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Nov 2017 14:47:02 -0700 Subject: [PATCH 01/10] Masking and overflow checks for datetimeindex and timedeltaindex ops (#18020) closes #17991 (cherry picked from commit 8388a47b7b09d345f463fe5fe91f32e87f7bb550) --- doc/source/whatsnew/v0.21.1.txt | 3 ++ pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 6 ++-- pandas/core/indexes/timedeltas.py | 3 +- .../tests/indexes/datetimes/test_datetime.py | 34 +++++++++++++++++++ pandas/tests/indexes/timedeltas/test_ops.py | 20 +++++++++++ 6 files changed, 64 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 00726a4606cf7..4a1a52082a83d 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -85,6 +85,9 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) +- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) +- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 71de6c7c3e8cf..4e9b2b9a2e922 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -681,7 +681,7 @@ def __sub__(self, other): return self._add_delta(-other) elif is_integer(other): return self.shift(-other) - elif isinstance(other, datetime): + elif isinstance(other, (datetime, np.datetime64)): return self._sub_datelike(other) elif isinstance(other, Period): return self._sub_period(other) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 50085889ad88f..3c518017a8808 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -31,6 +31,7 @@ import pandas.core.dtypes.concat as _concat from pandas.errors import PerformanceWarning from pandas.core.common import _values_from_object, _maybe_box +from pandas.core.algorithms import checked_add_with_arr from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index, Float64Index @@ -762,7 +763,7 @@ def _sub_datelike(self, other): raise TypeError("DatetimeIndex subtraction must have the same " "timezones or no timezones") result = self._sub_datelike_dti(other) - elif isinstance(other, datetime): + elif isinstance(other, (datetime, np.datetime64)): other = Timestamp(other) if other is libts.NaT: result = self._nat_new(box=False) @@ -772,7 +773,8 @@ def _sub_datelike(self, other): "timezones or no timezones") else: i8 = self.asi8 - result = i8 - other.value + result = checked_add_with_arr(i8, -other.value, + arr_mask=self._isnan) result = self._maybe_mask_results(result, fill_value=libts.iNaT) else: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 445adb6bd3b18..0cc35300f0d17 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -381,7 +381,8 @@ def _add_datelike(self, other): else: other = Timestamp(other) i8 = self.asi8 - result = checked_add_with_arr(i8, other.value) + result = checked_add_with_arr(i8, other.value, + arr_mask=self._isnan) result = self._maybe_mask_results(result, fill_value=iNaT) return DatetimeIndex(result, name=self.name, copy=False) diff 
--git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 8d9ac59cf9883..20a9916ad6bc4 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -211,6 +211,40 @@ def test_ufunc_coercions(self): tm.assert_index_equal(result, exp) assert result.freq == 'D' + def test_datetimeindex_sub_timestamp_overflow(self): + dtimax = pd.to_datetime(['now', pd.Timestamp.max]) + dtimin = pd.to_datetime(['now', pd.Timestamp.min]) + + tsneg = Timestamp('1950-01-01') + ts_neg_variants = [tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype('datetime64[ns]'), + tsneg.to_datetime64().astype('datetime64[D]')] + + tspos = Timestamp('1980-01-01') + ts_pos_variants = [tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype('datetime64[ns]'), + tspos.to_datetime64().astype('datetime64[D]')] + + for variant in ts_neg_variants: + with pytest.raises(OverflowError): + dtimax - variant + + expected = pd.Timestamp.max.value - tspos.value + for variant in ts_pos_variants: + res = dtimax - variant + assert res[1].value == expected + + expected = pd.Timestamp.min.value - tsneg.value + for variant in ts_neg_variants: + res = dtimin - variant + assert res[1].value == expected + + for variant in ts_pos_variants: + with pytest.raises(OverflowError): + dtimin - variant + def test_week_of_month_frequency(self): # GH 5348: "ValueError: Could not evaluate WOM-1SUN" shouldn't raise d1 = date(2002, 9, 1) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index f4f669ee1d087..3cf56dc5115c2 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -1282,3 +1282,23 @@ def test_add_overflow(self): result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + to_timedelta(['7 seconds', pd.NaT, '4 hours'])) tm.assert_index_equal(result, exp) + + def test_timedeltaindex_add_timestamp_nat_masking(self): + # GH17991 checking for overflow-masking with NaT + tdinat = pd.to_timedelta(['24658 days 11:15:00', 'NaT']) + + tsneg = Timestamp('1950-01-01') + ts_neg_variants = [tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype('datetime64[ns]'), + tsneg.to_datetime64().astype('datetime64[D]')] + + tspos = Timestamp('1980-01-01') + ts_pos_variants = [tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype('datetime64[ns]'), + tspos.to_datetime64().astype('datetime64[D]')] + + for variant in ts_neg_variants + ts_pos_variants: + res = tdinat + variant + assert res[1] is pd.NaT From 6b2e0a47eab7cb8e78f6233fc76f64e0fe14c3a9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 11 Dec 2017 14:36:59 -0600 Subject: [PATCH 02/10] fixup! 
Masking and overflow checks for datetimeindex and timedeltaindex ops (#18020) --- doc/source/whatsnew/v0.21.1.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4a1a52082a83d..3388aebe2e46a 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -85,9 +85,6 @@ Documentation Changes Bug Fixes ~~~~~~~~~ -- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) -- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) -- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) Conversion ^^^^^^^^^^ @@ -162,8 +159,8 @@ Numeric ^^^^^^^ - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`) -- -- +- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) +- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) - Categorical From d8d01041ed9c64083a7c1dcad4cc924c1c41f9fa Mon Sep 17 00:00:00 2001 From: Krzysztof Chomski <32264853+kchomski-reef@users.noreply.github.com> Date: Sat, 9 Dec 2017 01:27:04 +0100 Subject: [PATCH 03/10] BUG: fillna maximum recursion depth exceeded in cmp (GH18159). (#18385) (cherry picked from commit 27a64b2bb9d631ef584a941a3a3f66aebc2477f5) --- doc/source/whatsnew/v0.21.1.txt | 3 +++ pandas/core/internals.py | 14 +++++++++----- pandas/tests/internals/test_internals.py | 2 ++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 3388aebe2e46a..59b4f6514d4fc 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -85,6 +85,8 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +- + Conversion ^^^^^^^^^^ @@ -94,6 +96,7 @@ Conversion - Bug in :meth:`IntervalIndex.copy` when copying and ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`) - Bug in :func:`DataFrame.to_dict` where columns of datetime that are tz-aware were not converted to required arrays when used with ``orient='records'``, raising``TypeError` (:issue:`18372`) - Bug in :class:`DateTimeIndex` and :meth:`date_range` where mismatching tz-aware ``start`` and ``end`` timezones would not raise an err if ``end.tzinfo`` is None (:issue:`18431`) +- Bug in :meth:`Series.fillna` which raised when passed a long integer on Python 2 (:issue:`18159`). 
 -
 
 Indexing
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index b929dfd5a9d0b..3b7cd1d02e1d3 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -1837,8 +1837,10 @@ def _can_hold_element(self, element):
         if tipo is not None:
             return (issubclass(tipo.type, (np.floating, np.integer)) and
                     not issubclass(tipo.type, (np.datetime64, np.timedelta64)))
-        return (isinstance(element, (float, int, np.floating, np.int_)) and
-                not isinstance(element, (bool, np.bool_, datetime, timedelta,
+        return (
+            isinstance(
+                element, (float, int, np.floating, np.int_, compat.long))
+            and not isinstance(element, (bool, np.bool_, datetime, timedelta,
                                          np.datetime64, np.timedelta64)))
 
     def to_native_types(self, slicer=None, na_rep='', float_format=None,
@@ -1886,9 +1888,11 @@ def _can_hold_element(self, element):
         if tipo is not None:
             return issubclass(tipo.type,
                               (np.floating, np.integer, np.complexfloating))
-        return (isinstance(element,
-                           (float, int, complex, np.float_, np.int_)) and
-                not isinstance(element, (bool, np.bool_)))
+        return (
+            isinstance(
+                element,
+                (float, int, complex, np.float_, np.int_, compat.long))
+            and not isinstance(element, (bool, np.bool_)))
 
     def should_store(self, value):
         return issubclass(value.dtype.type, np.complexfloating)
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index c182db35c0c89..4e59779cb9b47 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -1245,7 +1245,9 @@ class TestCanHoldElement(object):
     @pytest.mark.parametrize('value, dtype', [
        (1, 'i8'),
        (1.0, 'f8'),
+       (2**63, 'f8'),
        (1j, 'complex128'),
+       (2**63, 'complex128'),
        (True, 'bool'),
        (np.timedelta64(20, 'ns'), '<m8[ns]'),

From: Joris Van den Bossche
Date: Sun, 10 Dec 2017 15:06:37 +0100
Subject: [PATCH 04/10] DOC: clean-up whatsnew file for 0.21.1 (#18690)

(cherry picked from commit 16de5f9e6c6ffda91be323c8cc4b6c0de628cdd3)
---
 doc/source/whatsnew/v0.21.1.txt | 64 +++++++++++++--------------------
 1 file changed, 24 insertions(+), 40 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 59b4f6514d4fc..6ee7303ae9fae 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -3,9 +3,23 @@
 v0.21.1
 -------
 
-This is a minor release from 0.21.1 and includes a number of deprecations, new
-features, enhancements, and performance improvements along with a large number
-of bug fixes. We recommend that all users upgrade to this version.
+This is a minor bug-fix release in the 0.21.x series and includes some small regression fixes,
+bug fixes and performance improvements.
+We recommend that all users upgrade to this version.
+
+Highlights include:
+
+- Temporarily restore matplotlib datetime plotting functionality. This should
+  resolve issues for users who relied implicitly on pandas to plot datetimes
+  with matplotlib. See :ref:`here <whatsnew_0211.converters>`.
+- Improvements to the Parquet IO functions introduced in 0.21.0. See
+  :ref:`here <whatsnew_0211.enhancements.parquet>`.
+
+
+.. contents:: What's new in v0.21.1
+    :local:
+    :backlinks: none
+
 
 .. _whatsnew_0211.special:
@@ -42,9 +56,13 @@ registering them when they want them.
 New features
 ~~~~~~~~~~~~
 
--
--
--
+.. _whatsnew_0211.enhancements.parquet:
+
+Improvements to the Parquet IO functionality
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
+- :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`)
 
 ..
_whatsnew_0211.enhancements.other: @@ -53,7 +71,6 @@ Other Enhancements - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) - :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`). -- .. _whatsnew_0211.deprecations: @@ -69,17 +86,6 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of plotting large series/dataframes (:issue:`18236`). -- -- - -.. _whatsnew_0211.docs: - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- -- -- .. _whatsnew_0211.bug_fixes: @@ -97,7 +103,6 @@ Conversion - Bug in :func:`DataFrame.to_dict` where columns of datetime that are tz-aware were not converted to required arrays when used with ``orient='records'``, raising``TypeError` (:issue:`18372`) - Bug in :class:`DateTimeIndex` and :meth:`date_range` where mismatching tz-aware ``start`` and ``end`` timezones would not raise an err if ``end.tzinfo`` is None (:issue:`18431`) - Bug in :meth:`Series.fillna` which raised when passed a long integer on Python 2 (:issue:`18159`). -- Indexing ^^^^^^^^ @@ -107,7 +112,6 @@ Indexing - Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`) - Bug in ``Index.putmask`` when an invalid mask passed (:issue:`18368`) - Bug in masked assignment of a ``timedelta64[ns]`` dtype ``Series``, incorrectly coerced to float (:issue:`18493`) -- I/O ^^^ @@ -117,8 +121,6 @@ I/O - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) - Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) -- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) -- :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). 
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) - Bug in :func:`read_json` not decoding when reading line deliminted JSON from S3 (:issue:`17200`) @@ -130,8 +132,6 @@ Plotting ^^^^^^^^ - Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`) -- -- Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -140,15 +140,6 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequecy is 12h or higher (:issue:`15549`) - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) - Bug in ``rolling.var`` where calculation is inaccurate with a zero-valued array (:issue:`18430`) -- -- - -Sparse -^^^^^^ - -- -- -- Reshaping ^^^^^^^^^ @@ -164,7 +155,6 @@ Numeric - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`) - Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) -- Categorical ^^^^^^^^^^^ @@ -180,9 +170,3 @@ String ^^^^^^ - :meth:`Series.str.split()` will now propogate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) - -Other -^^^^^ - -- -- From 13f8ffa48a65c24d349afc8d89c6b7f2e4c9aaf4 Mon Sep 17 00:00:00 2001 From: Sven Date: Mon, 11 Dec 2017 05:28:05 +1100 Subject: [PATCH 05/10] BUG: Categorical data fails to load from hdf when all columns are NaN (#18652) (cherry picked from commit 2db1cc098d939b7d7b41e18ad8525d18ad20c1ad) --- .gitignore | 1 + doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/pytables.py | 15 +++++++++++---- pandas/tests/io/test_pytables.py | 19 +++++++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index ff0a6aef47163..b1748ae72b8ba 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ doc/build/html/index.html doc/tmp.sv doc/source/styled.xlsx doc/source/templates/ +env/ diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 6ee7303ae9fae..8dc649f2f2a67 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -126,6 +126,7 @@ I/O - Bug in :func:`read_json` not decoding when reading line deliminted JSON from S3 (:issue:`17200`) - Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`) - Bug in :func:`to_latex` where repeated multi-index values were not printed even though a higher level index differed from the previous row (:issue:`14484`) +- Bug when reading NaN-only categorical columns in :class:`HDFStore` (:issue:`18413`) Plotting diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 40955c50f6b5f..2a1aaf2f66469 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2137,10 +2137,17 @@ def convert(self, values, nan_rep, encoding): # if we have stored a NaN in the categories # then strip it; in theory we could have BOTH # -1s in the codes and nulls :< - mask = isna(categories) - if mask.any(): - categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values + if categories is None: + # Handle case of NaN-only categorical columns in which case + # the categories are an empty array; when this is stored, + # pytables cannot write a zero-len array, so on 
readback
+            # the categories would be None and `read_hdf()` would fail.
+            categories = Index([], dtype=np.float64)
+        else:
+            mask = isna(categories)
+            if mask.any():
+                categories = categories[~mask]
+                codes[codes != -1] -= mask.astype(int).cumsum().values
 
         self.data = Categorical.from_codes(codes,
                                            categories=categories,
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
index a97747b93369f..a7cc6b711802e 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -4928,6 +4928,25 @@ def test_categorical_conversion(self):
         result = read_hdf(path, 'df', where='obsids=B')
         tm.assert_frame_equal(result, expected)
 
+    def test_categorical_nan_only_columns(self):
+        # GH18413
+        # Check that read_hdf with categorical columns with NaN-only values can
+        # be read back.
+        df = pd.DataFrame({
+            'a': ['a', 'b', 'c', np.nan],
+            'b': [np.nan, np.nan, np.nan, np.nan],
+            'c': [1, 2, 3, 4],
+            'd': pd.Series([None] * 4, dtype=object)
+        })
+        df['a'] = df.a.astype('category')
+        df['b'] = df.b.astype('category')
+        df['d'] = df.d.astype('category')
+        expected = df
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format='table', data_columns=True)
+            result = read_hdf(path, 'df')
+            tm.assert_frame_equal(result, expected)
+
     def test_duplicate_column_name(self):
         df = DataFrame(columns=["a", "a"], data=[[0, 0]])

From 3cbd1cc874076b7f98000d1aad7ac9815006fe82 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 10 Dec 2017 23:50:23 +0100
Subject: [PATCH 06/10] Parquet: Add error message for no engine found (#18717)

* parquet.py: Add error message for no engine found

Give a better error message for engine="auto" case when none of the
engines were found installed.

* clean-up to prevent merge conflicts with other PR

(cherry picked from commit b8b108c0c1e76025525d42fe382345ac3e72be62)
---
 pandas/io/parquet.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 4a13d2c9db944..179998957637f 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -25,6 +25,11 @@ def get_engine(engine):
         except ImportError:
             pass
 
+        raise ImportError("Unable to find a usable engine; "
+                          "tried using: 'pyarrow', 'fastparquet'.\n"
+                          "pyarrow or fastparquet is required for parquet "
+                          "support")
+
     if engine not in ['pyarrow', 'fastparquet']:
         raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

From 620048bc3ccfc0e43997ae4fd1810025460d0c82 Mon Sep 17 00:00:00 2001
From: Hans
Date: Mon, 11 Dec 2017 12:08:38 +0100
Subject: [PATCH 07/10] BUG: Fix to_latex with longtable (#17959) (#17960)

closes #17959

(cherry picked from commit e909ea0b2a583bcc9cfe3e759652351d7f0266cb)
---
 doc/source/whatsnew/v0.21.1.txt          |  1 +
 pandas/io/formats/format.py              |  4 +--
 pandas/tests/io/formats/test_to_latex.py | 35 ++++++++++++++++++++--
 3 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 8dc649f2f2a67..8651c9784daca 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -127,6 +127,7 @@ I/O
 - Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`)
 - Bug in :func:`to_latex` where repeated multi-index values were not printed even though a higher level index differed from the previous row (:issue:`14484`)
 - Bug when reading NaN-only categorical columns in :class:`HDFStore` (:issue:`18413`)
+- Bug in :meth:`DataFrame.to_latex` with ``longtable=True`` where a latex multicolumn always spanned over three columns (:issue:`17959`)
 
 
 Plotting
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 24eeb1dd94c18..bac5ac762400d 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -950,8 +950,8 @@ def get_col_type(dtype):
             if self.longtable:
                 buf.write('\\endhead\n')
                 buf.write('\\midrule\n')
-                buf.write('\\multicolumn{3}{r}{{Continued on next '
-                          'page}} \\\\\n')
+                buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
+                          'page}}}} \\\\\n'.format(n=len(row)))
                 buf.write('\\midrule\n')
                 buf.write('\\endfoot\n\n')
                 buf.write('\\bottomrule\n')
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index c0b7d4cee384a..5504ac942f688 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -91,6 +91,29 @@ def test_to_latex_format(self, frame):
 
         assert withindex_result == withindex_expected
 
+    def test_to_latex_empty(self):
+        df = DataFrame()
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{l}
+\toprule
+Empty DataFrame
+Columns: Index([], dtype='object')
+Index: Index([], dtype='object') \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+        result = df.to_latex(longtable=True)
+        expected = r"""\begin{longtable}{l}
+\toprule
+Empty DataFrame
+Columns: Index([], dtype='object')
+Index: Index([], dtype='object') \\
+\end{longtable}
+"""
+        assert result == expected
+
     def test_to_latex_with_formatters(self):
         df = DataFrame({'int': [1, 2, 3],
                         'float': [1.0, 2.0, 3.0],
@@ -377,7 +400,7 @@ def test_to_latex_longtable(self, frame):
 1 & 2 & b2 \\
 \end{longtable}
 """
 
         assert withindex_result == withindex_expected
 
         withoutindex_result = df.to_latex(index=False, longtable=True)
@@ -387,7 +410,7 @@
 \midrule
 \endhead
 \midrule
-\multicolumn{3}{r}{{Continued on next page}} \\
+\multicolumn{2}{r}{{Continued on next page}} \\
 \midrule
 \endfoot
@@ -400,6 +423,14 @@
 
         assert withoutindex_result == withoutindex_expected
 
+        df = DataFrame({'a': [1, 2]})
+        with1column_result = df.to_latex(index=False, longtable=True)
+        assert "\multicolumn{1}" in with1column_result
+
+        df = DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
+        with3columns_result = df.to_latex(index=False, longtable=True)
+        assert "\multicolumn{3}" in with3columns_result
+
     def test_to_latex_escape_special_chars(self):
         special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^', '\\']

From 9c7e90e63fa9b41d3244d06852d16f8d017c03a2 Mon Sep 17 00:00:00 2001
From: Dave Hirschfeld
Date: Tue, 12 Dec 2017 04:57:16 +1000
Subject: [PATCH 08/10] ENH: support non default indexes in writing to Parquet
 (#18629)

fastparquet automatically names an index 'index' if it doesn't already
have a name

(cherry picked from commit 8d7e8766df87a97b0f4436b6bd84b9782db1a07c)
---
 doc/source/io.rst               |   5 +-
 doc/source/whatsnew/v0.21.1.txt |   5 +-
 pandas/io/parquet.py            | 185 ++++++++++++++++++++------------
 pandas/tests/io/test_parquet.py | 106 ++++++++++++------
 4 files changed, 189 insertions(+), 112 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 4024414610a82..ba33c449e701f 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4548,11 +4548,8 @@ dtypes, including extension dtypes such as datetime with tz.
 
 Several caveats.
-- The format will NOT write an ``Index``, or ``MultiIndex`` for the - ``DataFrame`` and will raise an error if a non-default one is provided. You - can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to - ignore it. - Duplicate column names and non-string columns names are not supported +- Index level names, if specified, must be strings - Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message on an attempt at serialization. diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 8651c9784daca..206dabd1142ae 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -61,6 +61,9 @@ New features Improvements to the Parquet IO functionality ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- :func:`DataFrame.to_parquet` will now write non-default indexes when the + underlying engine supports it. The indexes will be preserved when reading + back in with :func:`read_parquet` (:issue:`18581`). - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) @@ -91,8 +94,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- - Conversion ^^^^^^^^^^ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 179998957637f..eaaa14e756e22 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -3,7 +3,8 @@ from warnings import catch_warnings from distutils.version import LooseVersion from pandas import DataFrame, RangeIndex, Int64Index, get_option -from pandas.compat import range +from pandas.compat import string_types +from pandas.core.common import AbstractMethodError from pandas.io.common import get_filepath_or_buffer @@ -39,37 +40,75 @@ def get_engine(engine): return FastParquetImpl() -class PyArrowImpl(object): +class BaseImpl(object): + + api = None # module + + @staticmethod + def validate_dataframe(df): + + if not isinstance(df, DataFrame): + raise ValueError("to_parquet only supports IO with DataFrames") + + # must have value column names (strings only) + if df.columns.inferred_type not in {'string', 'unicode'}: + raise ValueError("parquet must have string column names") + + # index level names must be strings + valid_names = all( + isinstance(name, string_types) + for name in df.index.names + if name is not None + ) + if not valid_names: + raise ValueError("Index level names must be strings") + + def write(self, df, path, compression, **kwargs): + raise AbstractMethodError(self) + + def read(self, path, columns=None, **kwargs): + raise AbstractMethodError(self) + + +class PyArrowImpl(BaseImpl): def __init__(self): # since pandas is a dependency of pyarrow # we need to import on first use - try: import pyarrow import pyarrow.parquet except ImportError: - raise ImportError("pyarrow is required for parquet support\n\n" - "you can install via conda\n" - "conda install pyarrow -c conda-forge\n" - "\nor via pip\n" - "pip install -U pyarrow\n") - + raise ImportError( + "pyarrow is required for parquet support\n\n" + "you can install via conda\n" + "conda install pyarrow -c conda-forge\n" + "\nor via pip\n" + "pip install -U pyarrow\n" + ) if LooseVersion(pyarrow.__version__) < '0.4.1': - raise ImportError("pyarrow >= 0.4.1 is required for parquet" - "support\n\n" - "you can install via conda\n" - "conda install pyarrow -c 
conda-forge\n" - "\nor via pip\n" - "pip install -U pyarrow\n") - - self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0' - self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0' + raise ImportError( + "pyarrow >= 0.4.1 is required for parquet support\n\n" + "you can install via conda\n" + "conda install pyarrow -c conda-forge\n" + "\nor via pip\n" + "pip install -U pyarrow\n" + ) + + self._pyarrow_lt_060 = ( + LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0')) + self._pyarrow_lt_070 = ( + LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0')) + self.api = pyarrow def write(self, df, path, compression='snappy', coerce_timestamps='ms', **kwargs): + self.validate_dataframe(df) + if self._pyarrow_lt_070: + self._validate_write_lt_070(df) path, _, _ = get_filepath_or_buffer(path) + if self._pyarrow_lt_060: table = self.api.Table.from_pandas(df, timestamps_to_ms=True) self.api.parquet.write_table( @@ -83,36 +122,75 @@ def write(self, df, path, compression='snappy', def read(self, path, columns=None, **kwargs): path, _, _ = get_filepath_or_buffer(path) + if self._pyarrow_lt_070: + return self.api.parquet.read_pandas(path, columns=columns, + **kwargs).to_pandas() + kwargs['use_pandas_metadata'] = True return self.api.parquet.read_table(path, columns=columns, **kwargs).to_pandas() - -class FastParquetImpl(object): + def _validate_write_lt_070(self, df): + # Compatibility shim for pyarrow < 0.7.0 + # TODO: Remove in pandas 0.22.0 + from pandas.core.indexes.multi import MultiIndex + if isinstance(df.index, MultiIndex): + msg = ( + "Multi-index DataFrames are only supported " + "with pyarrow >= 0.7.0" + ) + raise ValueError(msg) + # Validate index + if not isinstance(df.index, Int64Index): + msg = ( + "pyarrow < 0.7.0 does not support serializing {} for the " + "index; you can .reset_index() to make the index into " + "column(s), or install the latest version of pyarrow or " + "fastparquet." + ) + raise ValueError(msg.format(type(df.index))) + if not df.index.equals(RangeIndex(len(df))): + raise ValueError( + "pyarrow < 0.7.0 does not support serializing a non-default " + "index; you can .reset_index() to make the index into " + "column(s), or install the latest version of pyarrow or " + "fastparquet." + ) + if df.index.name is not None: + raise ValueError( + "pyarrow < 0.7.0 does not serialize indexes with a name; you " + "can set the index.name to None or install the latest version " + "of pyarrow or fastparquet." 
+ ) + + +class FastParquetImpl(BaseImpl): def __init__(self): # since pandas is a dependency of fastparquet # we need to import on first use - try: import fastparquet except ImportError: - raise ImportError("fastparquet is required for parquet support\n\n" - "you can install via conda\n" - "conda install fastparquet -c conda-forge\n" - "\nor via pip\n" - "pip install -U fastparquet") - + raise ImportError( + "fastparquet is required for parquet support\n\n" + "you can install via conda\n" + "conda install fastparquet -c conda-forge\n" + "\nor via pip\n" + "pip install -U fastparquet" + ) if LooseVersion(fastparquet.__version__) < '0.1.0': - raise ImportError("fastparquet >= 0.1.0 is required for parquet " - "support\n\n" - "you can install via conda\n" - "conda install fastparquet -c conda-forge\n" - "\nor via pip\n" - "pip install -U fastparquet") - + raise ImportError( + "fastparquet >= 0.1.0 is required for parquet " + "support\n\n" + "you can install via conda\n" + "conda install fastparquet -c conda-forge\n" + "\nor via pip\n" + "pip install -U fastparquet" + ) self.api = fastparquet def write(self, df, path, compression='snappy', **kwargs): + self.validate_dataframe(df) # thriftpy/protocol/compact.py:339: # DeprecationWarning: tostring() is deprecated. # Use tobytes() instead. @@ -123,7 +201,8 @@ def write(self, df, path, compression='snappy', **kwargs): def read(self, path, columns=None, **kwargs): path, _, _ = get_filepath_or_buffer(path) - return self.api.ParquetFile(path).to_pandas(columns=columns, **kwargs) + parquet_file = self.api.ParquetFile(path) + return parquet_file.to_pandas(columns=columns, **kwargs) def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): @@ -144,43 +223,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): kwargs Additional keyword arguments passed to the engine """ - impl = get_engine(engine) - - if not isinstance(df, DataFrame): - raise ValueError("to_parquet only support IO with DataFrames") - - valid_types = {'string', 'unicode'} - - # validate index - # -------------- - - # validate that we have only a default index - # raise on anything else as we don't serialize the index - - if not isinstance(df.index, Int64Index): - raise ValueError("parquet does not support serializing {} " - "for the index; you can .reset_index()" - "to make the index into column(s)".format( - type(df.index))) - - if not df.index.equals(RangeIndex.from_range(range(len(df)))): - raise ValueError("parquet does not support serializing a " - "non-default index for the index; you " - "can .reset_index() to make the index " - "into column(s)") - - if df.index.name is not None: - raise ValueError("parquet does not serialize index meta-data on a " - "default index") - - # validate columns - # ---------------- - - # must have value column names (strings only) - if df.columns.inferred_type not in valid_types: - raise ValueError("parquet must have string column names") - return impl.write(df, path, compression=compression, **kwargs) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e7bcff22371b7..8c88cf076319b 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -181,15 +181,14 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): class Base(object): def check_error_on_write(self, df, engine, exc): - # check that we are raising the exception - # on writing - + # check that we are raising the exception on writing with pytest.raises(exc): with tm.ensure_clean() as path: 
                to_parquet(df, path, engine, compression=None)
 
     def check_round_trip(self, df, engine, expected=None,
-                         write_kwargs=None, read_kwargs=None):
+                         write_kwargs=None, read_kwargs=None,
+                         check_names=True):
         if write_kwargs is None:
             write_kwargs = {}
         if read_kwargs is None:
@@ -200,7 +199,7 @@ def check_round_trip(self, df, engine, expected=None,
 
         if expected is None:
             expected = df
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_names=check_names)
 
         # repeat
         to_parquet(df, path, engine, **write_kwargs)
@@ -208,7 +207,7 @@
 
         if expected is None:
             expected = df
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_names=check_names)
 
 
 class TestBasic(Base):
@@ -247,33 +246,6 @@ def test_columns_dtypes_invalid(self, engine):
                      datetime.datetime(2011, 1, 1, 1, 1)]
         self.check_error_on_write(df, engine, ValueError)
 
-    def test_write_with_index(self, engine):
-
-        df = pd.DataFrame({'A': [1, 2, 3]})
-        self.check_round_trip(df, engine, write_kwargs={'compression': None})
-
-        # non-default index
-        for index in [[2, 3, 4],
-                      pd.date_range('20130101', periods=3),
-                      list('abc'),
-                      [1, 3, 4],
-                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
-                                                 ('b', 1)]),
-                      ]:
-
-            df.index = index
-            self.check_error_on_write(df, engine, ValueError)
-
-        # index with meta-data
-        df.index = [0, 1, 2]
-        df.index.name = 'foo'
-        self.check_error_on_write(df, engine, ValueError)
-
-        # column multi-index
-        df.index = [0, 1, 2]
-        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
-        self.check_error_on_write(df, engine, ValueError)
 
     @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
     def test_compression(self, engine, compression):
@@ -297,6 +269,72 @@ def test_read_columns(self, engine):
                               write_kwargs={'compression': None},
                               read_kwargs={'columns': ['string']})
 
+    def test_write_index(self, engine):
+        check_names = engine != 'fastparquet'
+
+        if engine == 'pyarrow':
+            import pyarrow
+            if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
+                pytest.skip("pyarrow is < 0.7.0")
+
+        df = pd.DataFrame({'A': [1, 2, 3]})
+        self.check_round_trip(df, engine, write_kwargs={'compression': None})
+
+        indexes = [
+            [2, 3, 4],
+            pd.date_range('20130101', periods=3),
+            list('abc'),
+            [1, 3, 4],
+        ]
+        # non-default index
+        for index in indexes:
+            df.index = index
+            self.check_round_trip(
+                df, engine,
+                write_kwargs={'compression': None},
+                check_names=check_names)
+
+        # index with meta-data
+        df.index = [0, 1, 2]
+        df.index.name = 'foo'
+        self.check_round_trip(df, engine, write_kwargs={'compression': None})
+
+    def test_write_multiindex(self, pa_ge_070):
+        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
+        engine = pa_ge_070
+
+        df = pd.DataFrame({'A': [1, 2, 3]})
+        index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+        df.index = index
+        self.check_round_trip(df, engine, write_kwargs={'compression': None})
+
+    def test_write_column_multiindex(self, engine):
+        # column multi-index
+        mi_columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
+        self.check_error_on_write(df, engine, ValueError)
+
+    def test_multiindex_with_columns(self, pa_ge_070):
+
+        engine = pa_ge_070
+        dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
+        df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
+                          columns=list('ABC'))
+        index1 = pd.MultiIndex.from_product(
+            [['Level1', 'Level2'], dates],
+            names=['level', 'date'])
+        index2 = index1.copy(names=None)
+        for index in [index1, index2]:
+            df.index = index
+            with tm.ensure_clean() as path:
+                df.to_parquet(path, engine)
+                result = read_parquet(path, engine)
+                expected = df
+                tm.assert_frame_equal(result, expected)
+                result = read_parquet(path, engine, columns=['A', 'B'])
+                expected = df[['A', 'B']]
+                tm.assert_frame_equal(result, expected)
 
 
 class TestParquetPyArrow(Base):
@@ -322,14 +360,12 @@ def test_basic(self, pa):
         self.check_round_trip(df, pa)
 
     def test_duplicate_columns(self, pa):
-
         # not currently able to handle duplicate columns
         df = pd.DataFrame(np.arange(12).reshape(4, 3),
                           columns=list('aaa')).copy()
         self.check_error_on_write(df, pa, ValueError)
 
     def test_unsupported(self, pa):
-
         # period
         df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
         self.check_error_on_write(df, pa, ValueError)

From 5ee4b912dcfb6763e8a0ebd2caa90554b5f708f1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 11 Dec 2017 22:03:50 +0100
Subject: [PATCH 09/10] DOC: fix options table (#18730)

* DOC: fix options table

* additional fix

(cherry picked from commit 8e34ec8cb92f14667f8e85c21ffaa02ab093f22a)
---
 doc/source/computation.rst | 2 +-
 doc/source/options.rst     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/computation.rst b/doc/source/computation.rst
index 0cdfec63fd696..cd3cc282a8010 100644
--- a/doc/source/computation.rst
+++ b/doc/source/computation.rst
@@ -348,7 +348,7 @@ The following methods are available:
 
 The weights used in the window are specified by the ``win_type`` keyword.
 The list of recognized types are the `scipy.signal window functions
- <https://docs.scipy.org/doc/scipy/reference/signal.html#window-functions>`__:
+<https://docs.scipy.org/doc/scipy/reference/signal.html#window-functions>`__:
 
 - ``boxcar``
 - ``triang``
diff --git a/doc/source/options.rst b/doc/source/options.rst
index db3380bd4a3e7..505a5ade68de0 100644
--- a/doc/source/options.rst
+++ b/doc/source/options.rst
@@ -433,7 +433,7 @@ compute.use_numexpr True Use the numexpr library to
 computation if it is installed.
 
 plotting.matplotlib.register_converters True Register custom converters with
 matplotlib. Set to False to de-register.
-======================================= ============ ========================================
+======================================= ============ ==================================
 
 .. _basics.console_output:

From 37f954a43a01cc1c7f48e2e4f10fafe5d1996967 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 11 Dec 2017 15:05:44 -0600
Subject: [PATCH 10/10] CFG: Ignore W503

Partial backport of https://github.com/pandas-dev/pandas/pull/18046/files
---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 0123078523b6f..7a88ee8557dc7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ tag_prefix = v
 parentdir_prefix = pandas-
 
 [flake8]
-ignore = E731,E402
+ignore = E731,E402,W503
 max-line-length = 79
 
 [yapf]
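
Editor's note: the masking idea behind patches 01/02 can be shown with a small standalone sketch. This is not pandas' actual ``checked_add_with_arr`` implementation, just a minimal model (with invented sample values) of why the int64 overflow check has to skip ``NaT`` slots:

import numpy as np

iNaT = np.iinfo(np.int64).min  # the i8 sentinel pandas stores for NaT


def masked_checked_add(arr, b, arr_mask=None):
    # Overflow-check addition of scalar b to an int64 array, ignoring
    # positions flagged in arr_mask (e.g. NaT); the caller re-masks those
    # positions back to iNaT afterwards (pandas' _maybe_mask_results step).
    mask = np.zeros(arr.shape, dtype=bool) if arr_mask is None else arr_mask
    to_check = arr[~mask]
    bounds = np.iinfo(np.int64)
    if b > 0 and (to_check > bounds.max - b).any():
        raise OverflowError("Overflow in int64 addition")
    if b < 0 and (to_check < bounds.min - b).any():
        raise OverflowError("Overflow in int64 addition")
    return arr + b  # NaT slots wrap silently here but are re-masked later


vals = np.array([iNaT, 10000], dtype=np.int64)

# Unmasked, the iNaT slot trips the lower-bound check and raises a spurious
# OverflowError (the pre-patch GH17991 symptom); masked, only real values
# are checked and the addition succeeds.
masked_checked_add(vals, -5000, arr_mask=(vals == iNaT))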
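The essence of patch 07's ``format.py`` change is the brace escaping in the new format string: doubled braces emit literal LaTeX braces while ``{n}`` is substituted with the real column count. A quick standalone check (the two-element ``row`` is a made-up stand-in for the formatter's current row):

row = ['a', 'b']  # stand-in: a longtable row with two columns

line = ('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
        'page}}}} \\\\\n'.format(n=len(row)))

# The old hard-coded string always emitted \multicolumn{3}{...}; the
# formatted one tracks the actual table width.
assert line == '\\multicolumn{2}{r}{{Continued on next page}} \\\\\n'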
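Patch 08's user-visible effect, sketched as usage (the file name is hypothetical, and per the ``_validate_write_lt_070`` shim this round trip assumes pyarrow >= 0.7.0 or fastparquet is installed):

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3]},
                  index=pd.date_range('2013-01-01', periods=3, name='date'))

# Before patch 08, to_parquet raised ValueError for any non-default index;
# now the index is written and restored by read_parquet.
df.to_parquet('example.parquet')
result = pd.read_parquet('example.parquet')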