Skip to content

BUG: #17778 - DataFrame.to_pickle() fails for .zip format on MacOS and pandas 0.20.3 #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ Documentation Changes
Bug Fixes
~~~~~~~~~

- Bug in ``DataFrame.to_pickle()`` where writing to the .zip format failed (:issue:`17778`)

Conversion
^^^^^^^^^^
Expand Down
23 changes: 13 additions & 10 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,17 +357,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
# ZIP Compression
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(path_or_buf)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
f = zip_file.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
if mode == 'wb':
f = zipfile.ZipFile(path_or_buf, 'w')
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))
zip_file = zipfile.ZipFile(path_or_buf)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
f = zip_file.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))

# XZ Compression
elif compression == 'xz':
Expand Down
15 changes: 13 additions & 2 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
""" pickle compat """

import os
import tempfile
import zipfile

import numpy as np
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
Expand All @@ -16,7 +20,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
obj : any object
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
a string representing the compression to use in the output file

.. versionadded:: 0.20.0
Expand All @@ -42,7 +46,14 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
if protocol < 0:
protocol = pkl.HIGHEST_PROTOCOL
try:
pkl.dump(obj, f, protocol=protocol)
if isinstance(f, zipfile.ZipFile):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really don't like this - I don't expect _get_handle to return anything other than a file-like object. Creating a temp file is unacceptable - a buffered solution can be introduced instead. Unfortunately, support for buffered zip writing is only available from Python 3.6, but we can backport it and place it in: https://github.com/pandas-dev/pandas/blob/master/pandas/compat/

Python 3.6 supports mode='w' in ZipFile.open (https://github.com/python/cpython/blob/master/Lib/zipfile.py#L1312), so this whole commit for Python 3.6 could look like:

if mode == 'wb':
    f = zipfile.open('data.bin', mode='w')
else:
    # find filename and open for read

It should be really easy to backport for Python 3.x - backporting for Python 2.7 can be trickier.

tmp_file = tempfile.NamedTemporaryFile(delete=False)
pkl.dump(obj, tmp_file, protocol=protocol)
tmp_file.close()
f.write(tmp_file.name)
os.remove(tmp_file.name)
else:
pkl.dump(obj, f, protocol=protocol)
finally:
for _f in fh:
_f.close()
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ def decompress_file(self, src_path, dest_path, compression):
fh.write(f.read())
f.close()

@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz'])
@pytest.mark.parametrize('compression', [None, 'gzip', 'zip', 'bz2', 'xz'])
def test_write_explicit(self, compression, get_random_path):
# issue 11666
if compression == 'xz':
Expand Down Expand Up @@ -414,7 +414,8 @@ def test_write_explicit_bad(self, compression, get_random_path):
df = tm.makeDataFrame()
df.to_pickle(path, compression=compression)

@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress'])
@pytest.mark.parametrize('ext', ['', '.gz', '.zip', '.bz2', '.xz',
'.no_compress'])
def test_write_infer(self, ext, get_random_path):
if ext == '.xz':
tm._skip_if_no_lzma()
Expand Down Expand Up @@ -442,7 +443,7 @@ def test_write_infer(self, ext, get_random_path):

tm.assert_frame_equal(df, df2)

@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"])
@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', 'zip'])
def test_read_explicit(self, compression, get_random_path):
# issue 11666
if compression == 'xz':
Expand Down