Skip to content

BUG/REG: file-handle object handled incorrectly in to_csv #21478

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jun 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version.
Fixed Regressions
~~~~~~~~~~~~~~~~~

-
- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
-

.. _whatsnew_0232.performance:
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1690,7 +1690,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : string, optional
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
line_terminator : string, default ``'\n'``
The newline character or character sequence to use in the output
file
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3790,7 +3790,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
non-ascii, for python versions prior to 3
compression : string, optional
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
date_format: string, default None
Format string for datetime objects.
decimal: string, default '.'
Expand Down
4 changes: 4 additions & 0 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,10 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
def write(self, data):
super(BytesZipFile, self).writestr(self.filename, data)

@property
def closed(self):
return self.fp is None


class MMapWrapper(BaseIterator):
"""
Expand Down
59 changes: 39 additions & 20 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

from __future__ import print_function

import warnings

import csv as csvlib
from zipfile import ZipFile
import numpy as np

from pandas.core.dtypes.missing import notna
from pandas.core.dtypes.inference import is_file_like
from pandas.core.index import Index, MultiIndex
from pandas import compat
from pandas.compat import (StringIO, range, zip)
Expand Down Expand Up @@ -128,19 +130,31 @@ def save(self):
else:
encoding = self.encoding

# PR 21300 uses string buffer to receive csv writing and dump into
# file-like output with compression as option. GH 21241, 21118
f = StringIO()
if not is_file_like(self.path_or_buf):
# path_or_buf is path
path_or_buf = self.path_or_buf
elif hasattr(self.path_or_buf, 'name'):
# path_or_buf is file handle
path_or_buf = self.path_or_buf.name
else:
# path_or_buf is file-like IO objects.
# GH 21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, 'write'):
msg = ("compression has no effect when passing file-like "
"object as input.")
warnings.warn(msg, RuntimeWarning, stacklevel=2)

# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
not hasattr(self.path_or_buf, 'write')
and self.compression == 'zip')

if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH 21241, 21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, 'write'):
f = self.path_or_buf
path_or_buf = None
close = False
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
close = True

try:
writer_kwargs = dict(lineterminator=self.line_terminator,
Expand All @@ -157,13 +171,18 @@ def save(self):
self._save()

finally:
# GH 17778 handles zip compression for byte strings separately.
buf = f.getvalue()
if path_or_buf:
f, handles = _get_handle(path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
f.write(buf)
if is_zip:
# GH 17778 handles zip compression separately.
buf = f.getvalue()
if hasattr(self.path_or_buf, 'write'):
self.path_or_buf.write(buf)
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
f.write(buf)
close = True
if close:
f.close()
for _fh in handles:
_fh.close()
Expand Down
16 changes: 9 additions & 7 deletions pandas/tests/frame/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np

from pandas.compat import (lmap, range, lrange, StringIO, u)
from pandas.io.common import _get_handle
import pandas.core.common as com
from pandas.errors import ParserError
from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
Expand Down Expand Up @@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression):
with ensure_clean() as filename:

df.to_csv(filename, compression=compression, encoding=encoding)

# test the round trip - to_csv -> read_csv
result = read_csv(filename, compression=compression,
index_col=0, encoding=encoding)
assert_frame_equal(df, result)

with open(filename, 'w') as fh:
df.to_csv(fh, compression=compression, encoding=encoding)

result_fh = read_csv(filename, compression=compression,
index_col=0, encoding=encoding)
# test the round trip using file handle - to_csv -> read_csv
f, _handles = _get_handle(filename, 'w', compression=compression,
encoding=encoding)
with f:
df.to_csv(f, encoding=encoding)
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_frame_equal(df, result)
assert_frame_equal(df, result_fh)

# explicitly make sure file is compressed
with tm.decompress_file(filename, compression) as fh:
Expand Down
18 changes: 9 additions & 9 deletions pandas/tests/series/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas import Series, DataFrame

from pandas.compat import StringIO, u
from pandas.io.common import _get_handle
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
assert_frame_equal, ensure_clean)
import pandas.util.testing as tm
Expand Down Expand Up @@ -151,20 +152,19 @@ def test_to_csv_compression(self, s, encoding, compression):

s.to_csv(filename, compression=compression, encoding=encoding,
header=True)

# test the round trip - to_csv -> read_csv
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_series_equal(s, result)

with open(filename, 'w') as fh:
s.to_csv(fh, compression=compression, encoding=encoding,
header=True)

result_fh = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0,
squeeze=True)
# test the round trip using file handle - to_csv -> read_csv
f, _handles = _get_handle(filename, 'w', compression=compression,
encoding=encoding)
with f:
s.to_csv(f, encoding=encoding, header=True)
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_series_equal(s, result)
assert_series_equal(s, result_fh)

# explicitly ensure file was compressed
with tm.decompress_file(filename, compression) as fh:
Expand Down
34 changes: 25 additions & 9 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas.compat import range, lmap
import pandas.core.common as com
from pandas.core import ops
from pandas.io.common import _get_handle
import pandas.util.testing as tm


Expand Down Expand Up @@ -246,19 +247,34 @@ def test_compression_size(obj, method, compression_only):
[12.32112, 123123.2, 321321.2]],
columns=['X', 'Y', 'Z']),
Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
@pytest.mark.parametrize('method', ['to_csv'])
@pytest.mark.parametrize('method', ['to_csv', 'to_json'])
def test_compression_size_fh(obj, method, compression_only):

with tm.ensure_clean() as filename:
with open(filename, 'w') as fh:
getattr(obj, method)(fh, compression=compression_only)
assert not fh.closed
assert fh.closed
f, _handles = _get_handle(filename, 'w', compression=compression_only)
with f:
getattr(obj, method)(f)
assert not f.closed
assert f.closed
compressed = os.path.getsize(filename)
with tm.ensure_clean() as filename:
with open(filename, 'w') as fh:
getattr(obj, method)(fh, compression=None)
assert not fh.closed
assert fh.closed
f, _handles = _get_handle(filename, 'w', compression=None)
with f:
getattr(obj, method)(f)
assert not f.closed
assert f.closed
uncompressed = os.path.getsize(filename)
assert uncompressed > compressed


# GH 21227
def test_compression_warning(compression_only):
df = DataFrame(100 * [[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
columns=['X', 'Y', 'Z'])
with tm.ensure_clean() as filename:
f, _handles = _get_handle(filename, 'w', compression=compression_only)
with tm.assert_produces_warning(RuntimeWarning,
check_stacklevel=False):
with f:
df.to_csv(f, compression=compression_only)