diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 16e6c12488b83..83a94bcbd9c79 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -493,6 +493,7 @@ I/O - Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`) - Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) +- Bug in :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) Plotting diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b87e46f9b6648..4b7a47c5f93c2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,15 +1,10 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license -import bz2 from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC from errno import ENOENT -import gzip -import io -import os import sys import time import warnings -import zipfile from libc.stdlib cimport free from libc.string cimport strcasecmp, strlen, strncpy @@ -17,7 +12,7 @@ from libc.string cimport strcasecmp, strlen, strncpy import cython from cython import Py_ssize_t -from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString +from cpython.bytes cimport PyBytes_AsString from cpython.exc cimport PyErr_Fetch, PyErr_Occurred from cpython.object cimport PyObject from cpython.ref cimport Py_XDECREF @@ -67,7 +62,6 @@ from pandas._libs.khash cimport ( khiter_t, ) -from pandas.compat import get_lzma_file, import_lzma from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( @@ -82,11 +76,10 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.concat import union_categoricals -lzma = import_lzma() - cdef: float64_t INF = np.inf float64_t NEGINF = -INF + int64_t DEFAULT_CHUNKSIZE = 256 * 1024 cdef extern from "headers/portable.h": @@ -275,14 +268,15 @@ cdef extern from "parser/io.h": size_t *bytes_read, int *status) -DEFAULT_CHUNKSIZE = 256 * 1024 - - cdef class TextReader: """ # source: StringIO or file object + .. versionchanged:: 1.2.0 + removed the 'compression', 'memory_map', and 'encoding' arguments. + These arguments are now handled by CParserWrapper. + 'source' has to be a file handle. 
""" cdef: @@ -299,7 +293,7 @@ cdef class TextReader: cdef public: int64_t leading_cols, table_width, skipfooter, buffer_lines - bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory + bint allow_leading_cols, mangle_dupe_cols, low_memory bint delim_whitespace object delimiter, converters object na_values @@ -307,8 +301,6 @@ cdef class TextReader: object index_col object skiprows object dtype - object encoding - object compression object usecols list dtype_cast_order set unnamed_cols @@ -321,10 +313,8 @@ cdef class TextReader: header_end=0, index_col=None, names=None, - bint memory_map=False, tokenize_chunksize=DEFAULT_CHUNKSIZE, bint delim_whitespace=False, - compression=None, converters=None, bint skipinitialspace=False, escapechar=None, @@ -332,7 +322,6 @@ cdef class TextReader: quotechar=b'"', quoting=0, lineterminator=None, - encoding=None, comment=None, decimal=b'.', thousands=None, @@ -356,15 +345,7 @@ cdef class TextReader: bint skip_blank_lines=True): # set encoding for native Python and C library - if encoding is not None: - if not isinstance(encoding, bytes): - encoding = encoding.encode('utf-8') - encoding = encoding.lower() - self.c_encoding = encoding - else: - self.c_encoding = NULL - - self.encoding = encoding + self.c_encoding = NULL self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -374,9 +355,6 @@ cdef class TextReader: # For timekeeping self.clocks = [] - self.compression = compression - self.memory_map = memory_map - self.parser.usecols = (usecols is not None) self._setup_parser_source(source) @@ -562,11 +540,6 @@ cdef class TextReader: parser_del(self.parser) def close(self): - # we need to properly close an open derived - # filehandle here, e.g. and UTFRecoder - if self.handle is not None: - self.handle.close() - # also preemptively free all allocated memory parser_free(self.parser) if self.true_set: @@ -614,82 +587,15 @@ cdef class TextReader: cdef: void *ptr - self.parser.cb_io = NULL - self.parser.cb_cleanup = NULL - - if self.compression: - if self.compression == 'gzip': - if isinstance(source, str): - source = gzip.GzipFile(source, 'rb') - else: - source = gzip.GzipFile(fileobj=source) - elif self.compression == 'bz2': - source = bz2.BZ2File(source, 'rb') - elif self.compression == 'zip': - zip_file = zipfile.ZipFile(source) - zip_names = zip_file.namelist() - - if len(zip_names) == 1: - file_name = zip_names.pop() - source = zip_file.open(file_name) - - elif len(zip_names) == 0: - raise ValueError(f'Zero files found in compressed ' - f'zip file {source}') - else: - raise ValueError(f'Multiple files found in compressed ' - f'zip file {zip_names}') - elif self.compression == 'xz': - if isinstance(source, str): - source = get_lzma_file(lzma)(source, 'rb') - else: - source = get_lzma_file(lzma)(filename=source) - else: - raise ValueError(f'Unrecognized compression type: ' - f'{self.compression}') - - if (self.encoding and hasattr(source, "read") and - not hasattr(source, "encoding")): - source = io.TextIOWrapper( - source, self.encoding.decode('utf-8'), newline='') - - self.encoding = b'utf-8' - self.c_encoding = self.encoding - - self.handle = source - - if isinstance(source, str): - encoding = sys.getfilesystemencoding() or "utf-8" - usource = source - source = source.encode(encoding) - - if self.memory_map: - ptr = new_mmap(source) - if ptr == NULL: - # fall back - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - else: - self.parser.cb_io 
= &buffer_mmap_bytes - self.parser.cb_cleanup = &del_mmap - else: - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - self.parser.source = ptr - - elif hasattr(source, 'read'): - # e.g., StringIO - - ptr = new_rd_source(source) - self.parser.source = ptr - self.parser.cb_io = &buffer_rd_bytes - self.parser.cb_cleanup = &del_rd_source - else: + if not hasattr(source, "read"): raise IOError(f'Expected file path name or file-like object, ' f'got {type(source)} type') + ptr = new_rd_source(source) + self.parser.source = ptr + self.parser.cb_io = &buffer_rd_bytes + self.parser.cb_cleanup = &del_rd_source + cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] diff --git a/pandas/_typing.py b/pandas/_typing.py index 3376559fb23ff..3e89cf24632e2 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,6 +1,6 @@ -from dataclasses import dataclass from datetime import datetime, timedelta, tzinfo -from io import IOBase +from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper +from mmap import mmap from pathlib import Path from typing import ( IO, @@ -10,7 +10,6 @@ Callable, Collection, Dict, - Generic, Hashable, List, Mapping, @@ -77,8 +76,6 @@ "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] -FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase] -FileOrBuffer = Union[str, IO[AnyStr], IOBase] # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series @@ -133,6 +130,10 @@ "Resampler", ] +# filenames and file-like-objects +Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap] +FileOrBuffer = Union[str, Buffer[T]] +FilePathOrBuffer = Union[Path, FileOrBuffer[T]] # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] @@ -150,21 +151,3 @@ # type of float formatter in DataFrameFormatter FloatFormatType = Union[str, Callable, "EngFormatter"] - - -@dataclass -class IOargs(Generic[ModeVar, EncodingVar]): - """ - Return value of io/common.py:get_filepath_or_buffer. - - Note (copy&past from io/parsers): - filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] - though mypy handling of conditional imports is difficult. 
- See https://github.com/python/mypy/issues/1297 - """ - - filepath_or_buffer: FileOrBuffer - encoding: EncodingVar - compression: CompressionDict - should_close: bool - mode: Union[ModeVar, str] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 24b89085ac121..a3130ec27713d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,6 +15,7 @@ import datetime from io import StringIO import itertools +import mmap from textwrap import dedent from typing import ( IO, @@ -2286,10 +2287,9 @@ def to_markdown( if buf is None: return result ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options) - assert not isinstance(ioargs.filepath_or_buffer, str) + assert not isinstance(ioargs.filepath_or_buffer, (str, mmap.mmap)) ioargs.filepath_or_buffer.writelines(result) - if ioargs.should_close: - ioargs.filepath_or_buffer.close() + ioargs.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/io/common.py b/pandas/io/common.py index c147ae9fd0aa8..90a79e54015c4 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -2,8 +2,9 @@ import bz2 from collections import abc +import dataclasses import gzip -from io import BufferedIOBase, BytesIO, RawIOBase +from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper import mmap import os import pathlib @@ -13,12 +14,14 @@ Any, AnyStr, Dict, + Generic, List, Mapping, Optional, Tuple, Type, Union, + cast, ) from urllib.parse import ( urljoin, @@ -31,12 +34,12 @@ import zipfile from pandas._typing import ( + Buffer, CompressionDict, CompressionOptions, EncodingVar, FileOrBuffer, FilePathOrBuffer, - IOargs, ModeVar, StorageOptions, ) @@ -56,6 +59,76 @@ from io import IOBase +@dataclasses.dataclass +class IOArgs(Generic[ModeVar, EncodingVar]): + """ + Return value of io/common.py:get_filepath_or_buffer. + + This is used to easily close created fsspec objects. + + Note (copied from io/parsers): + filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] + though mypy handling of conditional imports is difficult. + See https://github.com/python/mypy/issues/1297 + """ + + filepath_or_buffer: FileOrBuffer + encoding: EncodingVar + mode: Union[ModeVar, str] + compression: CompressionDict + should_close: bool = False + + def close(self) -> None: + """ + Close the buffer if it was created by get_filepath_or_buffer. + """ + if self.should_close: + assert not isinstance(self.filepath_or_buffer, str) + try: + self.filepath_or_buffer.close() + except (OSError, ValueError): + pass + self.should_close = False + + +@dataclasses.dataclass +class IOHandles: + """ + Return value of io/common.py:get_handle. + + This is used to easily close created buffers and to handle corner cases when + TextIOWrapper is inserted. + + handle: The file handle to be used. + created_handles: All file handles that are created by get_handle + is_wrapped: Whether a TextIOWrapper needs to be detached. + """ + + handle: Buffer + created_handles: List[Buffer] = dataclasses.field(default_factory=list) + is_wrapped: bool = False + + def close(self) -> None: + """ + Close all created buffers. + + Note: If a TextIOWrapper was inserted, it is flushed and detached to + avoid closing the potentially user-created buffer. 
+ """ + if self.is_wrapped: + assert isinstance(self.handle, TextIOWrapper) + self.handle.flush() + self.handle.detach() + self.created_handles.remove(self.handle) + try: + for handle in self.created_handles: + handle.close() + except (OSError, ValueError): + pass + self.created_handles = [] + self.is_wrapped = False + + def is_url(url) -> bool: """ Check to see if a URL has a valid protocol. @@ -176,7 +249,7 @@ def get_filepath_or_buffer( compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, -) -> IOargs[ModeVar, EncodingVar]: +) -> IOArgs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -201,7 +274,7 @@ def get_filepath_or_buffer( ..versionchange:: 1.2.0 - Returns the dataclass IOargs. + Returns the dataclass IOArgs. """ filepath_or_buffer = stringify_path(filepath_or_buffer) @@ -225,6 +298,10 @@ def get_filepath_or_buffer( compression = dict(compression, method=compression_method) + # uniform encoding names + if encoding is not None: + encoding = encoding.replace("_", "-").lower() + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files if ( @@ -258,7 +335,7 @@ def get_filepath_or_buffer( compression = {"method": "gzip"} reader = BytesIO(req.read()) req.close() - return IOargs( + return IOArgs( filepath_or_buffer=reader, encoding=encoding, compression=compression, @@ -310,7 +387,7 @@ def get_filepath_or_buffer( filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() - return IOargs( + return IOArgs( filepath_or_buffer=file_obj, encoding=encoding, compression=compression, @@ -323,7 +400,7 @@ def get_filepath_or_buffer( ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): - return IOargs( + return IOArgs( filepath_or_buffer=_expand_user(filepath_or_buffer), encoding=encoding, compression=compression, @@ -335,7 +412,7 @@ def get_filepath_or_buffer( msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) - return IOargs( + return IOArgs( filepath_or_buffer=filepath_or_buffer, encoding=encoding, compression=compression, @@ -455,14 +532,14 @@ def infer_compression( def get_handle( - path_or_buf, + path_or_buf: FilePathOrBuffer, mode: str, - encoding=None, + encoding: Optional[str] = None, compression: CompressionOptions = None, memory_map: bool = False, is_text: bool = True, - errors=None, -): + errors: Optional[str] = None, +) -> IOHandles: """ Get file handle for given path/buffer and mode. @@ -506,14 +583,9 @@ def get_handle( See the errors argument for :func:`open` for a full list of options. - .. versionadded:: 1.1.0 + .. versionchanged:: 1.2.0 - Returns - ------- - f : file-like - A file-like object. - handles : list of file-like objects - A list of file-like object that were opened in this function. + Returns the dataclass IOHandles """ need_text_wrapping: Tuple[Type["IOBase"], ...] try: @@ -532,12 +604,16 @@ def get_handle( except ImportError: pass - handles: List[Union[IO, _MMapWrapper]] = list() - f = path_or_buf + handles: List[Buffer] = list() + + # Windows does not default to utf-8. 
Set to utf-8 for a consistent behavior + if encoding is None: + encoding = "utf-8" # Convert pathlib.Path/py.path.local or string path_or_buf = stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) + f = path_or_buf compression, compression_args = get_compression_method(compression) if is_path: @@ -548,25 +624,29 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: + assert isinstance(path_or_buf, str) f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args) else: - f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) + f = gzip.GzipFile( + fileobj=path_or_buf, # type: ignore[arg-type] + mode=mode, + **compression_args, + ) # BZ Compression elif compression == "bz2": - f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) + f = bz2.BZ2File( + path_or_buf, mode=mode, **compression_args # type: ignore[arg-type] + ) # ZIP Compression elif compression == "zip": - zf = _BytesZipFile(path_or_buf, mode, **compression_args) - # Ensure the container is closed as well. - handles.append(zf) - if zf.mode == "w": - f = zf - elif zf.mode == "r": - zip_names = zf.namelist() + f = _BytesZipFile(path_or_buf, mode, **compression_args) + if f.mode == "r": + handles.append(f) + zip_names = f.namelist() if len(zip_names) == 1: - f = zf.open(zip_names.pop()) + f = f.open(zip_names.pop()) elif len(zip_names) == 0: raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: @@ -584,36 +664,40 @@ def get_handle( msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) + assert not isinstance(f, str) handles.append(f) elif is_path: # Check whether the filename is to be opened in binary mode. # Binary mode does not support 'encoding' and 'newline'. is_binary_mode = "b" in mode - + assert isinstance(path_or_buf, str) if encoding and not is_binary_mode: # Encoding f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") - elif is_text and not is_binary_mode: - # No explicit encoding - f = open(path_or_buf, mode, errors="replace", newline="") else: # Binary mode f = open(path_or_buf, mode) handles.append(f) # Convert BytesIO or file objects passed with an encoding - if is_text and (compression or isinstance(f, need_text_wrapping)): - from io import TextIOWrapper - - g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="") - if not isinstance(f, (BufferedIOBase, RawIOBase)): - handles.append(g) - f = g + is_wrapped = False + if is_text and ( + compression + or isinstance(f, need_text_wrapping) + or "b" in getattr(f, "mode", "") + ): + f = TextIOWrapper( + f, encoding=encoding, errors=errors, newline="" # type: ignore[arg-type] + ) + handles.append(f) + # do not mark as wrapped when the user provided a string + is_wrapped = not is_path if memory_map and hasattr(f, "fileno"): + assert not isinstance(f, str) try: - wrapped = _MMapWrapper(f) + wrapped = cast(mmap.mmap, _MMapWrapper(f)) # type: ignore[arg-type] f.close() handles.remove(f) handles.append(wrapped) @@ -625,7 +709,13 @@ def get_handle( # leave the file handler as is then pass - return f, handles + handles.reverse() # close the most recently added buffer first + assert not isinstance(f, str) + return IOHandles( + handle=f, + created_handles=handles, + is_wrapped=is_wrapped, + ) # error: Definition of "__exit__" in base class "ZipFile" is incompatible with diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3461652f4ea24..03c61c3ed8376 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -17,6 +17,7 @@ 
from pandas.core.frame import DataFrame from pandas.io.common import ( + IOArgs, get_filepath_or_buffer, is_url, stringify_path, @@ -349,24 +350,37 @@ def read_excel( class BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): + self.ioargs = IOArgs( + filepath_or_buffer=filepath_or_buffer, + encoding=None, + mode=None, + compression={"method": None}, + ) # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): - filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) + self.ioargs = IOArgs( + filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()), + should_close=True, + encoding=None, + mode=None, + compression={"method": None}, + ) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer = get_filepath_or_buffer( + self.ioargs = get_filepath_or_buffer( filepath_or_buffer, storage_options=storage_options - ).filepath_or_buffer + ) - if isinstance(filepath_or_buffer, self._workbook_class): - self.book = filepath_or_buffer - elif hasattr(filepath_or_buffer, "read"): + if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class): + self.book = self.ioargs.filepath_or_buffer + elif hasattr(self.ioargs.filepath_or_buffer, "read"): # N.B. xlrd.Book has a read attribute too - filepath_or_buffer.seek(0) - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, str): - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, bytes): - self.book = self.load_workbook(BytesIO(filepath_or_buffer)) + assert not isinstance(self.ioargs.filepath_or_buffer, str) + self.ioargs.filepath_or_buffer.seek(0) + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, str): + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(self.ioargs.filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -382,7 +396,7 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): - pass + self.ioargs.close() @property @abc.abstractmethod diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 9a42b8289ab47..198acd5862d45 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -81,9 +81,7 @@ def to_feather( feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs) - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() def read_feather( @@ -137,9 +135,6 @@ def read_feather( ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads) ) - # s3fs only validates the credentials when the file is closed. 
- if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() return df diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6c62d6825bc84..20226dbb3c9d4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -3,7 +3,6 @@ """ import csv as csvlib -from io import StringIO, TextIOWrapper import os from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union @@ -39,7 +38,7 @@ class CSVFormatter: def __init__( self, formatter: "DataFrameFormatter", - path_or_buf: Optional[FilePathOrBuffer[str]] = None, + path_or_buf: FilePathOrBuffer[str] = "", sep: str = ",", cols: Optional[Sequence[Label]] = None, index_label: Optional[IndexLabel] = None, @@ -60,25 +59,14 @@ def __init__( self.obj = self.fmt.frame - self.encoding = encoding or "utf-8" - - if path_or_buf is None: - path_or_buf = StringIO() - - ioargs = get_filepath_or_buffer( + self.ioargs = get_filepath_or_buffer( path_or_buf, - encoding=self.encoding, + encoding=encoding, compression=compression, mode=mode, storage_options=storage_options, ) - self.compression = ioargs.compression.pop("method") - self.compression_args = ioargs.compression - self.path_or_buf = ioargs.filepath_or_buffer - self.should_close = ioargs.should_close - self.mode = ioargs.mode - self.sep = sep self.index_label = self._initialize_index_label(index_label) self.errors = errors @@ -238,20 +226,19 @@ def save(self) -> None: """ Create the writer & save. """ - # get a handle or wrap an existing handle to take care of 1) compression and - # 2) text -> byte conversion - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, + # apply compression and byte/text conversion + handles = get_handle( + self.ioargs.filepath_or_buffer, + self.ioargs.mode, + encoding=self.ioargs.encoding, errors=self.errors, - compression=dict(self.compression_args, method=self.compression), + compression=self.ioargs.compression, ) try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( - f, + handles.handle, # type: ignore[arg-type] lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, @@ -263,23 +250,10 @@ def save(self) -> None: self._save() finally: - if self.should_close: - f.close() - elif ( - isinstance(f, TextIOWrapper) - and not f.closed - and f != self.path_or_buf - and hasattr(self.path_or_buf, "write") - ): - # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper - # closes the wrapped handle if it is not detached. 
- f.flush() # make sure everything is written - f.detach() # makes f unusable - del f - elif f != self.path_or_buf: - f.close() - for _fh in handles: - _fh.close() + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + self.ioargs.close() def _save(self) -> None: if self._need_to_save_header: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3c759f477899b..43e76d0aef490 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1046,8 +1046,12 @@ def to_csv( """ from pandas.io.formats.csvs import CSVFormatter + created_buffer = path_or_buf is None + if created_buffer: + path_or_buf = StringIO() + csv_formatter = CSVFormatter( - path_or_buf=path_or_buf, + path_or_buf=path_or_buf, # type: ignore[arg-type] line_terminator=line_terminator, sep=sep, encoding=encoding, @@ -1067,9 +1071,11 @@ def to_csv( ) csv_formatter.save() - if path_or_buf is None: - assert isinstance(csv_formatter.path_or_buf, StringIO) - return csv_formatter.path_or_buf.getvalue() + if created_buffer: + assert isinstance(path_or_buf, StringIO) + content = path_or_buf.getvalue() + path_or_buf.close() + return content return None diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 98b9a585d890e..bfb57f415db3b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from collections import abc import functools -from io import BytesIO, StringIO +from io import StringIO from itertools import islice import os -from typing import IO, Any, Callable, List, Mapping, Optional, Tuple, Type, Union +from typing import Any, Callable, Mapping, Optional, Tuple, Type, Union import numpy as np @@ -26,7 +26,12 @@ from pandas.core.generic import NDFrame from pandas.core.reshape.concat import concat -from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle +from pandas.io.common import ( + IOHandles, + get_compression_method, + get_filepath_or_buffer, + get_handle, +) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import validate_integer @@ -59,17 +64,6 @@ def to_json( "'index=False' is only valid when 'orient' is 'split' or 'table'" ) - if path_or_buf is not None: - ioargs = get_filepath_or_buffer( - path_or_buf, - compression=compression, - mode="wt", - storage_options=storage_options, - ) - path_or_buf = ioargs.filepath_or_buffer - should_close = ioargs.should_close - compression = ioargs.compression - if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") @@ -101,20 +95,27 @@ def to_json( if lines: s = convert_to_line_delimits(s) - if isinstance(path_or_buf, str): - fh, handles = get_handle(path_or_buf, "w", compression=compression) + if path_or_buf is not None: + # open fsspec URLs + ioargs = get_filepath_or_buffer( + path_or_buf, + compression=compression, + mode="wt", + storage_options=storage_options, + ) + # apply compression and byte/text conversion + handles = get_handle( + ioargs.filepath_or_buffer, "w", compression=ioargs.compression + ) try: - fh.write(s) + handles.handle.write(s) finally: - fh.close() - for handle in handles: - handle.close() - elif path_or_buf is None: - return s + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + ioargs.close() else: - path_or_buf.write(s) - if should_close: - path_or_buf.close() + 
return s class Writer(ABC): @@ -545,12 +546,10 @@ def read_json( dtype = True if convert_axes is None and orient != "table": convert_axes = True - if encoding is None: - encoding = "utf-8" ioargs = get_filepath_or_buffer( path_or_buf, - encoding=encoding, + encoding=encoding or "utf-8", compression=compression, storage_options=storage_options, ) @@ -577,9 +576,7 @@ def read_json( return json_reader result = json_reader.read() - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - ioargs.filepath_or_buffer.close() + ioargs.close() return result @@ -629,9 +626,8 @@ def __init__( self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 - self.should_close = False self.nrows = nrows - self.file_handles: List[IO] = [] + self.handles: Optional[IOHandles] = None if self.chunksize is not None: self.chunksize = validate_integer("chunksize", self.chunksize, 1) @@ -670,30 +666,25 @@ def _get_data_from_filepath(self, filepath_or_buffer): This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. """ - data = filepath_or_buffer - + # if it is a string but the file does not exist, it might be a JSON string exists = False - if isinstance(data, str): + if isinstance(filepath_or_buffer, str): try: exists = os.path.exists(filepath_or_buffer) # gh-5874: if the filepath is too long will raise here except (TypeError, ValueError): pass - if exists or self.compression["method"] is not None: - data, self.file_handles = get_handle( + if exists or not isinstance(filepath_or_buffer, str): + self.handles = get_handle( filepath_or_buffer, "r", encoding=self.encoding, compression=self.compression, ) - self.should_close = True - self.open_stream = data - - if isinstance(data, BytesIO): - data = data.getvalue().decode() + filepath_or_buffer = self.handles.handle - return data + return filepath_or_buffer def _combine_lines(self, lines) -> str: """ @@ -757,13 +748,8 @@ def close(self): If an open stream or file was passed, we leave it open. 
""" - if self.should_close: - try: - self.open_stream.close() - except (OSError, AttributeError): - pass - for file_handle in self.file_handles: - file_handle.close() + if self.handles is not None: + self.handles.close() def __next__(self): if self.nrows: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 829ff6408d86d..5a734f0878a0c 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -53,4 +53,5 @@ def read_orc( ioargs = get_filepath_or_buffer(path) orc_file = pyarrow.orc.ORCFile(ioargs.filepath_or_buffer) result = orc_file.read(columns=columns, **kwargs).to_pandas() + ioargs.close() return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2110a2d400be8..3b72869188344 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO, TextIOWrapper +from io import StringIO import itertools import re import sys @@ -63,7 +63,13 @@ from pandas.core.series import Series from pandas.core.tools import datetimes as tools -from pandas.io.common import get_filepath_or_buffer, get_handle, validate_header_arg +from pandas.io.common import ( + get_compression_method, + get_filepath_or_buffer, + get_handle, + stringify_path, + validate_header_arg, +) from pandas.io.date_converters import generic_parser # BOM character (byte order mark) @@ -428,17 +434,16 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" - encoding = kwds.get("encoding", None) storage_options = kwds.get("storage_options", None) - if encoding is not None: - encoding = re.sub("_", "-", encoding).lower() - kwds["encoding"] = encoding - compression = kwds.get("compression", "infer") ioargs = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression, storage_options=storage_options + filepath_or_buffer, + kwds.get("encoding", None), + kwds.get("compression", "infer"), + storage_options=storage_options, ) kwds["compression"] = ioargs.compression + kwds["encoding"] = ioargs.encoding if kwds.get("date_parser", None) is not None: if isinstance(kwds["parse_dates"], bool): @@ -461,14 +466,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): try: data = parser.read(nrows) finally: + # close compression and byte/text wrapper parser.close() - - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - try: - ioargs.filepath_or_buffer.close() - except ValueError: - pass + # close any fsspec-like objects + ioargs.close() return data @@ -1350,10 +1351,6 @@ def __init__(self, kwds): self._first_chunk = True - # GH 13932 - # keep references to file handles opened by the parser itself - self.handles = [] - def _validate_parse_dates_presence(self, columns: List[str]) -> None: """ Check if parse_dates are in columns. @@ -1403,8 +1400,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: ) def close(self): - for f in self.handles: - f.close() + self.handles.close() @property def _has_complex_date_col(self): @@ -1838,23 +1834,29 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - encoding = kwds.get("encoding") + if kwds.get("memory_map", False): + # memory-mapped files are directly handled by the TextReader. 
+ src = stringify_path(src) - # parsers.TextReader doesn't support compression dicts - if isinstance(kwds.get("compression"), dict): - kwds["compression"] = kwds["compression"]["method"] - - if kwds.get("compression") is None and encoding: - if isinstance(src, str): - src = open(src, "rb") - self.handles.append(src) - - # Handle the file object with universal line mode enabled. - # We will handle the newline character ourselves later on. - if hasattr(src, "read") and not hasattr(src, "encoding"): - src = TextIOWrapper(src, encoding=encoding, newline="") + if get_compression_method(kwds.get("compression", None))[0] is not None: + raise ValueError( + "read_csv does not support compression with memory_map=True. " + + "Please use memory_map=False instead." + ) - kwds["encoding"] = "utf-8" + self.handles = get_handle( + src, + mode="r", + encoding=kwds.get("encoding", None), + compression=kwds.get("compression", None), + memory_map=kwds.get("memory_map", False), + is_text=True, + ) + if kwds.get("memory_map", False) and hasattr(self.handles.handle, "mmap"): + self.handles.handle = self.handles.handle.mmap + kwds.pop("encoding", None) + kwds.pop("memory_map", None) + kwds.pop("compression", None) # #2442 kwds["allow_leading_cols"] = self.index_col is not False @@ -1863,7 +1865,7 @@ self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) kwds["usecols"] = self.usecols - self._reader = parsers.TextReader(src, **kwds) + self._reader = parsers.TextReader(self.handles.handle, **kwds) self.unnamed_cols = self._reader.unnamed_cols passed_names = self.names is None @@ -1942,11 +1944,10 @@ self._implicit_index = self._reader.leading_cols > 0 - def close(self): - for f in self.handles: - f.close() + def close(self) -> None: + super().close() - # close additional handles opened by C parser (for compression) + # close additional handles opened by C parser try: self._reader.close() except ValueError: @@ -2237,20 +2238,19 @@ self.comment = kwds["comment"] self._comment_lines = [] - f, handles = get_handle( + self.handles = get_handle( f, "r", encoding=self.encoding, compression=self.compression, memory_map=self.memory_map, ) - self.handles.extend(handles) # Set self.data to something that can read lines. - if hasattr(f, "readline"): - self._make_reader(f) + if hasattr(self.handles.handle, "readline"): + self._make_reader(self.handles.handle) else: - self.data = f + self.data = self.handles.handle # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. 
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 426a40a65b522..6fa044b4651a5 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -92,25 +92,18 @@ def to_pickle( mode="wb", storage_options=storage_options, ) - f, fh = get_handle( + handles = get_handle( ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, is_text=False ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: - pickle.dump(obj, f, protocol=protocol) + pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] finally: - if f != filepath_or_buffer: - # do not close user-provided file objects GH 35679 - f.close() - for _f in fh: - _f.close() - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - try: - ioargs.filepath_or_buffer.close() - except ValueError: - pass + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + ioargs.close() def read_pickle( @@ -193,7 +186,7 @@ def read_pickle( ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) - f, fh = get_handle( + handles = get_handle( ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False ) @@ -208,24 +201,17 @@ def read_pickle( with warnings.catch_warnings(record=True): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) - return pickle.load(f) + return pickle.load(handles.handle) # type: ignore[arg-type] except excs_to_catch: # e.g. # "No module named 'pandas.core.sparse.series'" # "Can't get attribute '__nat_unpickle' on None: def close(self) -> None: """ close the handle if its open """ - try: - self.path_or_buf.close() - except OSError: - pass + self.ioargs.close() def _set_encoding(self) -> None: """ @@ -1938,7 +1936,7 @@ def _open_file_binary_write( fname: FilePathOrBuffer, compression: CompressionOptions, storage_options: StorageOptions = None, -) -> Tuple[BinaryIO, bool, CompressionOptions]: +) -> Tuple[IOHandles, CompressionOptions]: """ Open a binary file or no-op if file-like. @@ -1958,34 +1956,22 @@ def _open_file_binary_write( docs for the set of allowed keys and values .. 
versionadded:: 1.2.0 - - Returns - ------- - file : file-like object - File object supporting write - own : bool - True if the file was created, otherwise False """ - if hasattr(fname, "write"): - # See https://github.com/python/mypy/issues/1424 for hasattr challenges - # error: Incompatible return value type (got "Tuple[Union[str, Path, - # IO[Any]], bool, None]", expected "Tuple[BinaryIO, bool, Union[str, - # Mapping[str, str], None]]") - return fname, False, None # type: ignore[return-value] - elif isinstance(fname, (str, Path)): - # Extract compression mode as given, if dict - ioargs = get_filepath_or_buffer( - fname, mode="wb", compression=compression, storage_options=storage_options - ) - f, _ = get_handle( - ioargs.filepath_or_buffer, - "wb", - compression=ioargs.compression, - is_text=False, - ) - return f, True, ioargs.compression - else: - raise TypeError("fname must be a binary file, buffer or path-like.") + ioargs = get_filepath_or_buffer( + fname, mode="wb", compression=compression, storage_options=storage_options + ) + handles = get_handle( + ioargs.filepath_or_buffer, + "wb", + compression=ioargs.compression, + is_text=False, + ) + if ioargs.filepath_or_buffer != fname and not isinstance( + ioargs.filepath_or_buffer, str + ): + # add handle created by get_filepath_or_buffer + handles.created_handles.append(ioargs.filepath_or_buffer) + return handles, ioargs.compression def _set_endianness(endianness: str) -> str: @@ -2236,9 +2222,8 @@ def __init__( self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels - self._own_file = True self._compression = compression - self._output_file: Optional[BinaryIO] = None + self._output_file: Optional[Buffer] = None # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) self.storage_options = storage_options @@ -2249,21 +2234,20 @@ def __init__( self._fname = stringify_path(fname) self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} self._converted_names: Dict[Label, str] = {} - self._file: Optional[BinaryIO] = None def _write(self, to_write: str) -> None: """ Helper to call encode before writing to file for Python 3 compat. """ - assert self._file is not None - self._file.write(to_write.encode(self._encoding)) + self.handles.handle.write( + to_write.encode(self._encoding) # type: ignore[arg-type] + ) def _write_bytes(self, value: bytes) -> None: """ Helper to assert file is open before writing. """ - assert self._file is not None - self._file.write(value) + self.handles.handle.write(value) # type: ignore[arg-type] def _prepare_categoricals(self, data: DataFrame) -> DataFrame: """ @@ -2527,12 +2511,14 @@ def _encode_strings(self) -> None: self.data[col] = encoded def write_file(self) -> None: - self._file, self._own_file, compression = _open_file_binary_write( + self.handles, compression = _open_file_binary_write( self._fname, self._compression, storage_options=self.storage_options ) if compression is not None: - self._output_file = self._file - self._file = BytesIO() + # ZipFile creates a file (with the same name) for each write call. + # Write it first into a buffer and then write the buffer to the ZipFile. 
+ self._output_file = self.handles.handle + self.handles.handle = BytesIO() try: self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) self._write_map() @@ -2552,10 +2538,9 @@ def write_file(self) -> None: self._write_map() except Exception as exc: self._close() - if self._own_file: + if isinstance(self._fname, (str, Path)): try: - if isinstance(self._fname, (str, Path)): - os.unlink(self._fname) + os.unlink(self._fname) except OSError: warnings.warn( f"This save was not successful but {self._fname} could not " @@ -2571,24 +2556,18 @@ def _close(self) -> None: Close the file if it was created by the writer. If a buffer or file-like object was passed in, for example a GzipFile, - then leave this file open for the caller to close. In either case, - attempt to flush the file contents to ensure they are written to disk - (if supported) + then leave this file open for the caller to close. """ - # Some file-like objects might not support flush - assert self._file is not None + # write compression if self._output_file is not None: - assert isinstance(self._file, BytesIO) - bio = self._file + assert isinstance(self.handles.handle, BytesIO) + bio = self.handles.handle bio.seek(0) - self._file = self._output_file - self._file.write(bio.read()) - try: - self._file.flush() - except AttributeError: - pass - if self._own_file: - self._file.close() + self.handles.handle = self._output_file + self.handles.handle.write(bio.read()) # type: ignore[arg-type] + bio.close() + # close any created handles + self.handles.close() def _write_map(self) -> None: """No-op, future compatibility""" @@ -3140,8 +3119,8 @@ def _tag(val: Union[str, bytes], tag: str) -> bytes: def _update_map(self, tag: str) -> None: """Update map location for tag with file position""" - assert self._file is not None - self._map[tag] = self._file.tell() + assert self.handles.handle is not None + self._map[tag] = self.handles.handle.tell() def _write_header( self, @@ -3208,12 +3187,11 @@ def _write_map(self) -> None: the map with 0s. The second call writes the final map locations when all blocks have been written. 
""" - assert self._file is not None if not self._map: self._map = dict( ( ("stata_data", 0), - ("map", self._file.tell()), + ("map", self.handles.handle.tell()), ("variable_types", 0), ("varnames", 0), ("sortlist", 0), @@ -3229,7 +3207,7 @@ def _write_map(self) -> None: ) ) # Move to start of map - self._file.seek(self._map["map"]) + self.handles.handle.seek(self._map["map"]) bio = BytesIO() for val in self._map.values(): bio.write(struct.pack(self._byteorder + "Q", val)) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 5bf1ce508dfc4..3103f6e1ba0b1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1034,11 +1034,12 @@ def test_to_csv_compression(self, df, encoding, compression): tm.assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = get_handle( + handles = get_handle( filename, "w", compression=compression, encoding=encoding ) - with f: - df.to_csv(f, encoding=encoding) + df.to_csv(handles.handle, encoding=encoding) + assert not handles.handle.closed + handles.close() result = pd.read_csv( filename, compression=compression, diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 933bdc462e3f8..2e68d3306c7d1 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -143,7 +143,7 @@ def test_readjson_chunks_closes(chunksize): ) reader.read() assert ( - reader.open_stream.closed + reader.handles.handle.closed ), f"didn't close stream with chunksize = {chunksize}" diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b33289213e258..e61a5fce99c69 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -6,7 +6,7 @@ import csv from datetime import datetime from inspect import signature -from io import StringIO +from io import BytesIO, StringIO import os import platform from urllib.error import URLError @@ -2253,3 +2253,62 @@ def test_dict_keys_as_names(all_parsers): result = parser.read_csv(StringIO(data), names=keys) expected = DataFrame({"a": [1], "b": [2]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_read_csv_file_handle(all_parsers, io_class, encoding): + """ + Test whether read_csv does not close user-provided file handles. + + GH 36980 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + content = "a,b\n1,2" + if io_class == BytesIO: + content = content.encode("utf-8") + handle = io_class(content) + + tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) + assert not handle.closed + + +def test_memory_map_compression_error(c_parser_only): + """ + c-parsers do not support memory_map=True with compression. + + GH 36997 + """ + parser = c_parser_only + df = DataFrame({"a": [1], "b": [2]}) + msg = ( + "read_csv does not support compression with memory_map=True. " + + "Please use memory_map=False instead." + ) + + with tm.ensure_clean() as path: + df.to_csv(path, compression="gzip", index=False) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, memory_map=True, compression="gzip") + + +def test_memory_map_file_handle(all_parsers): + """ + Support some buffers with memory_map=True. 
+ + GH 36997 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + handle = StringIO() + expected.to_csv(handle, index=False) + handle.seek(0) + + tm.assert_frame_equal( + parser.read_csv(handle, memory_map=True), + expected, + ) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 876696ecdad9c..e74265da3e966 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -152,14 +152,17 @@ def test_binary_mode_file_buffers( with open(fpath, mode="r", encoding=encoding) as fa: result = parser.read_csv(fa) + assert not fa.closed tm.assert_frame_equal(expected, result) with open(fpath, mode="rb") as fb: result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed tm.assert_frame_equal(expected, result) with open(fpath, mode="rb", buffering=0) as fb: result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed tm.assert_frame_equal(expected, result) @@ -199,6 +202,7 @@ def test_encoding_named_temp_file(all_parsers): result = parser.read_csv(f, encoding=encoding) tm.assert_frame_equal(result, expected) + assert not f.closed @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 1c2518646bb29..413b78a52ad38 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -31,13 +31,10 @@ def test_file_handle(self): reader = TextReader(f) reader.read() - def test_string_filename(self): - reader = TextReader(self.csv1, header=None) - reader.read() - def test_file_handle_mmap(self): + # this was never using memory_map=True with open(self.csv1, "rb") as f: - reader = TextReader(f, memory_map=True, header=None) + reader = TextReader(f, header=None) reader.read() def test_StringIO(self): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 31e9ad4cf4416..8d7d5d85cbb48 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -47,18 +47,18 @@ def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=compression_only) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed + handles = icom.get_handle(path, "w", compression=compression_only) + getattr(obj, method)(handles.handle) + assert not handles.handle.closed + handles.close() + assert handles.handle.closed compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=None) - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed + handles = icom.get_handle(path, "w", compression=None) + getattr(obj, method)(handles.handle) + assert not handles.handle.closed + handles.close() + assert handles.handle.closed uncompressed_size = os.path.getsize(path) assert uncompressed_size > compressed_size @@ -111,10 +111,10 @@ def test_compression_warning(compression_only): columns=["X", "Y", "Z"], ) with tm.ensure_clean() as path: - f, handles = icom.get_handle(path, "w", compression=compression_only) + handles = icom.get_handle(path, "w", compression=compression_only) with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): - with f: - df.to_csv(f, compression=compression_only) + df.to_csv(handles.handle, 
compression=compression_only) + handles.close() def test_compression_binary(compression_only): diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index a72e860340f25..714173158f4d6 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -143,11 +143,11 @@ def test_to_csv_compression(self, s, encoding, compression): tm.assert_series_equal(s, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = get_handle( + handles = get_handle( filename, "w", compression=compression, encoding=encoding ) - with f: - s.to_csv(f, encoding=encoding, header=True) + s.to_csv(handles.handle, encoding=encoding, header=True) + handles.close() result = pd.read_csv( filename, compression=compression,
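
Editorial note, not part of the patch: the tests above capture the user-facing contract of this change. The following is a minimal sketch of that contract, assuming a build of this branch is installed; the output file name "out.csv.gz" is made up for the example.

# Illustrative sketch only; not part of the patch. Assumes a build of this
# branch is installed. The file name "out.csv.gz" is invented for the example.
from io import BytesIO

import pandas as pd
from pandas.io.common import get_handle

# GH 36980: read_csv with the C engine and an explicit encoding no longer
# closes a user-provided binary handle.
handle = BytesIO(b"a,b\n1,2")
df = pd.read_csv(handle, engine="c", encoding="utf-8")
assert not handle.closed

# get_handle now returns the IOHandles dataclass instead of a (handle, list)
# tuple: callers write to .handle and release any wrappers created here
# (compression, text conversion) via .close().
handles = get_handle("out.csv.gz", "w", compression="gzip", encoding="utf-8")
df.to_csv(handles.handle, index=False)
handles.close()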