diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index c7113e663789b..437e75be0e55b 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -10,7 +10,10 @@ ) import zipfile -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + FilePath, + ReadPickleBuffer, +) from pandas.compat import ( get_lzma_file, import_lzma, @@ -277,7 +280,7 @@ def can_connect(url, error_classes=None): def round_trip_pickle( - obj: Any, path: FilePathOrBuffer | None = None + obj: Any, path: FilePath | ReadPickleBuffer | None = None ) -> DataFrame | Series: """ Pickle an object and then read it again. diff --git a/pandas/_typing.py b/pandas/_typing.py index 85e29681285f4..89e1c0bf7a71f 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,28 +1,24 @@ +from __future__ import annotations + from datetime import ( datetime, timedelta, tzinfo, ) -from io import ( - BufferedIOBase, - RawIOBase, - TextIOBase, -) -from mmap import mmap from os import PathLike from typing import ( - IO, TYPE_CHECKING, Any, - AnyStr, Callable, Collection, Dict, Hashable, + Iterator, List, Literal, Mapping, Optional, + Protocol, Sequence, Tuple, Type as type_t, @@ -169,9 +165,76 @@ PythonFuncType = Callable[[Any], Any] # filenames and file-like-objects -Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, mmap] -FileOrBuffer = Union[str, Buffer[AnyStr]] -FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[AnyStr]] +AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True) +AnyStr_con = TypeVar("AnyStr_con", str, bytes, contravariant=True) + + +class BaseBuffer(Protocol): + @property + def mode(self) -> str: + # for _get_filepath_or_buffer + ... + + def fileno(self) -> int: + # for _MMapWrapper + ... + + def seek(self, __offset: int, __whence: int = ...) -> int: + # with one argument: gzip.GzipFile, bz2.BZ2File + # with two arguments: zip.ZipFile, read_sas + ... + + def seekable(self) -> bool: + # for bz2.BZ2File + ... + + def tell(self) -> int: + # for zip.ZipFile, read_stata, to_stata + ... + + +class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]): + def read(self, __n: int | None = ...) -> AnyStr_cov: + # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File + ... + + +class WriteBuffer(BaseBuffer, Protocol[AnyStr_con]): + def write(self, __b: AnyStr_con) -> Any: + # for gzip.GzipFile, bz2.BZ2File + ... + + def flush(self) -> Any: + # for gzip.GzipFile, bz2.BZ2File + ... + + +class ReadPickleBuffer(ReadBuffer[bytes], Protocol): + def readline(self) -> AnyStr_cov: + ... + + +class WriteExcelBuffer(WriteBuffer[bytes], Protocol): + def truncate(self, size: int | None = ...) -> int: + ... + + +class ReadCsvBuffer(ReadBuffer[AnyStr_cov], Protocol): + def __iter__(self) -> Iterator[AnyStr_cov]: + # for engine=python + ... + + def readline(self) -> AnyStr_cov: + # for engine=python + ... + + @property + def closed(self) -> bool: + # for enine=pyarrow + ... 
+ + +FilePath = Union[str, "PathLike[str]"] # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b88c97b8e988d..0d1b8e995f18c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,7 +16,6 @@ import functools from io import StringIO import itertools -import mmap from textwrap import dedent from typing import ( IO, @@ -55,7 +54,7 @@ CompressionOptions, Dtype, DtypeObj, - FilePathOrBuffer, + FilePath, FillnaOptions, FloatFormatType, FormattersType, @@ -71,6 +70,7 @@ TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, + WriteBuffer, npt, ) from pandas.compat._optional import import_optional_dependency @@ -1056,7 +1056,7 @@ def _repr_html_(self) -> str | None: @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_string( self, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, columns: Sequence[str] | None = None, col_space: int | None = None, header: bool | Sequence[str] = True, @@ -2432,7 +2432,7 @@ def _from_arrays( @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, - path: FilePathOrBuffer, + path: FilePath | WriteBuffer[bytes], convert_dates: dict[Hashable, str] | None = None, write_index: bool = True, byteorder: str | None = None, @@ -2454,11 +2454,9 @@ def to_stata( Parameters ---------- - path : str, buffer or path object - String, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() function. If using a buffer - then the buffer will not be automatically closed after the file - data has been written. + path : str, path object, or buffer + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. .. versionchanged:: 1.0.0 @@ -2600,14 +2598,16 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path: FilePathOrBuffer[bytes], **kwargs) -> None: + def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: """ Write a DataFrame to the binary Feather format. Parameters ---------- - path : str or file-like object - If a string, it will be used as Root Directory path. + path : str, path object, file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If a string or a path, + it will be used as Root Directory path when writing a partitioned dataset. **kwargs : Additional keywords passed to :func:`pyarrow.feather.write_feather`. Starting with pyarrow 0.17, this includes the `compression`, @@ -2677,15 +2677,14 @@ def to_markdown( return result with get_handle(buf, mode, storage_options=storage_options) as handles: - assert not isinstance(handles.handle, (str, mmap.mmap)) - handles.handle.writelines(result) + handles.handle.write(result) return None @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, - path: FilePathOrBuffer | None = None, + path: FilePath | WriteBuffer[bytes] | None = None, engine: str = "auto", compression: str | None = "snappy", index: bool | None = None, @@ -2703,13 +2702,11 @@ def to_parquet( Parameters ---------- - path : str or file-like object, default None - If a string, it will be used as Root Directory path - when writing a partitioned dataset. 
By file-like object, - we refer to objects with a write() method, such as a file handle - (e.g. via builtin open function) or io.BytesIO. The engine - fastparquet does not accept file-like objects. If path is None, - a bytes object is returned. + path : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If None, the result is + returned as bytes. If a string or path, it will be used as Root Directory + path when writing a partitioned dataset. .. versionchanged:: 1.2.0 @@ -2804,7 +2801,7 @@ def to_parquet( @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_html( self, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, columns: Sequence[str] | None = None, col_space: ColspaceArgType | None = None, header: bool | Sequence[str] = True, @@ -2891,7 +2888,7 @@ def to_html( @doc(storage_options=generic._shared_docs["storage_options"]) def to_xml( self, - path_or_buffer: FilePathOrBuffer | None = None, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, index: bool = True, root_name: str | None = "data", row_name: str | None = "row", @@ -2904,7 +2901,7 @@ def to_xml( xml_declaration: bool | None = True, pretty_print: bool | None = True, parser: str | None = "lxml", - stylesheet: FilePathOrBuffer | None = None, + stylesheet: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> str | None: @@ -2915,9 +2912,10 @@ def to_xml( Parameters ---------- - path_or_buffer : str, path object or file-like object, optional - File to write output to. If None, the output is returned as a - string. + path_or_buffer : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a ``write()`` function. If None, the result is returned + as a string. index : bool, default True Whether to include index in XML document. root_name : str, default 'data' @@ -3211,7 +3209,7 @@ def to_xml( def info( self, verbose: bool | None = None, - buf: IO[str] | None = None, + buf: WriteBuffer[str] | None = None, max_cols: int | None = None, memory_usage: bool | str | None = None, show_counts: bool | None = None, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fd8af2c0cedd0..0945193673107 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12,7 +12,6 @@ from typing import ( TYPE_CHECKING, Any, - AnyStr, Callable, Hashable, Literal, @@ -44,7 +43,7 @@ Dtype, DtypeArg, DtypeObj, - FilePathOrBuffer, + FilePath, IndexKeyFunc, IndexLabel, JSONSerializable, @@ -58,6 +57,7 @@ TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, + WriteBuffer, npt, ) from pandas.compat._optional import import_optional_dependency @@ -2332,7 +2332,7 @@ def to_excel( @doc(storage_options=_shared_docs["storage_options"]) def to_json( self, - path_or_buf: FilePathOrBuffer | None = None, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, orient: str | None = None, date_format: str | None = None, double_precision: int = 10, @@ -2353,9 +2353,10 @@ def to_json( Parameters ---------- - path_or_buf : str or file handle, optional - File path or object. If not specified, the result is returned as - a string. 
+ path_or_buf : str, path object, file-like object, or None, default None + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. orient : str Indication of expected JSON string format. @@ -3337,7 +3338,7 @@ def to_latex( @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, - path_or_buf: FilePathOrBuffer[AnyStr] | None = None, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, sep: str = ",", na_rep: str = "", float_format: str | None = None, @@ -3364,10 +3365,11 @@ def to_csv( Parameters ---------- - path_or_buf : str or file handle, default None - File path or object, if None is provided the result is returned as - a string. If a non-binary file object is passed, it should be opened - with `newline=''`, disabling universal newlines. If a binary + path_or_buf : str, path object, file-like object, or None, default None + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. If a non-binary file object is passed, it should + be opened with `newline=''`, disabling universal newlines. If a binary file object is passed, `mode` might need to contain a `'b'`. .. versionchanged:: 1.2.0 diff --git a/pandas/io/common.py b/pandas/io/common.py index 1e928d1f2cd9e..fa94319c75fa9 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -26,6 +26,7 @@ Generic, Literal, Mapping, + TypeVar, cast, overload, ) @@ -40,12 +41,13 @@ import zipfile from pandas._typing import ( - Buffer, + BaseBuffer, CompressionDict, CompressionOptions, - FileOrBuffer, - FilePathOrBuffer, + FilePath, + ReadBuffer, StorageOptions, + WriteBuffer, ) from pandas.compat import ( get_lzma_file, @@ -61,19 +63,16 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") +BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) + @dataclasses.dataclass class IOArgs: """ Return value of io/common.py:_get_filepath_or_buffer. - - Note (copy&past from io/parsers): - filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] - though mypy handling of conditional imports is difficult. - See https://github.com/python/mypy/issues/1297 """ - filepath_or_buffer: FileOrBuffer + filepath_or_buffer: str | BaseBuffer encoding: str mode: str compression: CompressionDict @@ -95,9 +94,10 @@ class IOHandles(Generic[AnyStr]): is_wrapped: Whether a TextIOWrapper needs to be detached. """ - handle: Buffer[AnyStr] + # handle might not implement the IO-interface + handle: IO[AnyStr] compression: CompressionDict - created_handles: list[Buffer] = dataclasses.field(default_factory=list) + created_handles: list[IO[bytes] | IO[str]] = dataclasses.field(default_factory=list) is_wrapped: bool = False is_mmap: bool = False @@ -128,7 +128,7 @@ def __exit__(self, *args: Any) -> None: self.close() -def is_url(url) -> bool: +def is_url(url: object) -> bool: """ Check to see if a URL has a valid protocol. @@ -146,7 +146,17 @@ def is_url(url) -> bool: return parse_url(url).scheme in _VALID_URLS -def _expand_user(filepath_or_buffer: FileOrBuffer[AnyStr]) -> FileOrBuffer[AnyStr]: +@overload +def _expand_user(filepath_or_buffer: str) -> str: + ... + + +@overload +def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT: + ... 
+ + +def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT: """ Return the argument with an initial component of ~ or ~user replaced by that user's home directory. @@ -174,10 +184,22 @@ def validate_header_arg(header) -> None: ) +@overload +def stringify_path(filepath_or_buffer: FilePath, convert_file_like: bool = ...) -> str: + ... + + +@overload def stringify_path( - filepath_or_buffer: FilePathOrBuffer[AnyStr], + filepath_or_buffer: BaseBufferT, convert_file_like: bool = ... +) -> BaseBufferT: + ... + + +def stringify_path( + filepath_or_buffer: FilePath | BaseBufferT, convert_file_like: bool = False, -) -> FileOrBuffer[AnyStr]: +) -> str | BaseBufferT: """ Attempt to convert a path-like object to a string. @@ -201,7 +223,7 @@ def stringify_path( # GH 38125: some fsspec objects implement os.PathLike but have already opened a # file. This prevents opening the file a second time. infer_compression calls # this function with convert_file_like=True to infer the compression. - return cast(FileOrBuffer[AnyStr], filepath_or_buffer) + return cast(BaseBufferT, filepath_or_buffer) if isinstance(filepath_or_buffer, os.PathLike): filepath_or_buffer = filepath_or_buffer.__fspath__() @@ -218,7 +240,7 @@ def urlopen(*args, **kwargs): return urllib.request.urlopen(*args, **kwargs) -def is_fsspec_url(url: FilePathOrBuffer) -> bool: +def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: """ Returns true if the given URL looks like something fsspec can handle @@ -231,7 +253,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: def _get_filepath_or_buffer( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | BaseBuffer, encoding: str = "utf-8", compression: CompressionOptions = None, mode: str = "r", @@ -393,7 +415,11 @@ def _get_filepath_or_buffer( mode=mode, ) - if not is_file_like(filepath_or_buffer): + # is_file_like requires (read | write) & __iter__ but __iter__ is only + # needed for read_csv(engine=python) + if not ( + hasattr(filepath_or_buffer, "read") or hasattr(filepath_or_buffer, "write") + ): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) @@ -463,7 +489,7 @@ def get_compression_method( def infer_compression( - filepath_or_buffer: FilePathOrBuffer, compression: str | None + filepath_or_buffer: FilePath | BaseBuffer, compression: str | None ) -> str | None: """ Get the compression method for filepath_or_buffer. 
If compression='infer', @@ -538,7 +564,7 @@ def check_parent_directory(path: Path | str) -> None: @overload def get_handle( - path_or_buf: FilePathOrBuffer, + path_or_buf: FilePath | BaseBuffer, mode: str, *, encoding: str | None = ..., @@ -553,7 +579,7 @@ def get_handle( @overload def get_handle( - path_or_buf: FilePathOrBuffer, + path_or_buf: FilePath | BaseBuffer, mode: str, *, encoding: str | None = ..., @@ -567,7 +593,7 @@ def get_handle( def get_handle( - path_or_buf: FilePathOrBuffer, + path_or_buf: FilePath | BaseBuffer, mode: str, *, encoding: str | None = None, @@ -649,7 +675,7 @@ def get_handle( ) handle = ioargs.filepath_or_buffer - handles: list[Buffer] + handles: list[BaseBuffer] # memory mapping needs to be the first step handle, memory_map, handles = _maybe_memory_map( @@ -677,17 +703,18 @@ def get_handle( if compression == "gzip": if is_path: assert isinstance(handle, str) - handle = gzip.GzipFile( + # error: Incompatible types in assignment (expression has type + # "GzipFile", variable has type "Union[str, BaseBuffer]") + handle = gzip.GzipFile( # type: ignore[assignment] filename=handle, mode=ioargs.mode, **compression_args, ) else: handle = gzip.GzipFile( - # error: Argument "fileobj" to "GzipFile" has incompatible type - # "Union[str, Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap]]"; expected "Optional[IO[bytes]]" - fileobj=handle, # type: ignore[arg-type] + # No overload variant of "GzipFile" matches argument types + # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" + fileobj=handle, # type: ignore[call-overload] mode=ioargs.mode, **compression_args, ) @@ -706,7 +733,12 @@ def get_handle( # ZIP Compression elif compression == "zip": - handle = _BytesZipFile(handle, ioargs.mode, **compression_args) + # error: Argument 1 to "_BytesZipFile" has incompatible type "Union[str, + # BaseBuffer]"; expected "Union[Union[str, PathLike[str]], + # ReadBuffer[bytes], WriteBuffer[bytes]]" + handle = _BytesZipFile( + handle, ioargs.mode, **compression_args # type: ignore[arg-type] + ) if handle.mode == "r": handles.append(handle) zip_names = handle.namelist() @@ -787,10 +819,14 @@ def get_handle( assert not isinstance(ioargs.filepath_or_buffer, str) handles.append(ioargs.filepath_or_buffer) - assert not isinstance(handle, str) return IOHandles( - handle=handle, - created_handles=handles, + # error: Argument "handle" to "IOHandles" has incompatible type + # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes], + # typing.IO[Any]]"; expected "pandas._typing.IO[Any]" + handle=handle, # type: ignore[arg-type] + # error: Argument "created_handles" to "IOHandles" has incompatible type + # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]" + created_handles=handles, # type: ignore[arg-type] is_wrapped=is_wrapped, is_mmap=memory_map, compression=ioargs.compression, @@ -821,7 +857,7 @@ class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc] # GH 17778 def __init__( self, - file: FilePathOrBuffer, + file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str, archive_name: str | None = None, **kwargs, @@ -974,15 +1010,15 @@ def detach(self): def _maybe_memory_map( - handle: FileOrBuffer, + handle: str | BaseBuffer, memory_map: bool, encoding: str, mode: str, errors: str | None, decode: bool, -) -> tuple[FileOrBuffer, bool, list[Buffer]]: +) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]: """Try to memory map file/buffer.""" - handles: list[Buffer] = [] + handles: list[BaseBuffer] = [] memory_map &= hasattr(handle, 
"fileno") or isinstance(handle, str) if not memory_map: return handle, memory_map, handles @@ -1001,10 +1037,11 @@ def _maybe_memory_map( # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" wrapped = cast( - mmap.mmap, + BaseBuffer, _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] ) - handle.close() + # error: "BaseBuffer" has no attribute "close" + handle.close() # type: ignore[attr-defined] handles.remove(handle) handles.append(wrapped) handle = wrapped @@ -1018,7 +1055,7 @@ def _maybe_memory_map( return handle, memory_map, handles -def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: +def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool: """Test whether file exists.""" exists = False filepath_or_buffer = stringify_path(filepath_or_buffer) @@ -1032,7 +1069,7 @@ def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: return exists -def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: +def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool: """Whether the handle is opened in binary mode""" # specified by user if "t" in mode or "b" in mode: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 22fbaaaa8b2f8..04052b0fe9fdf 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -6,6 +6,7 @@ import os from textwrap import fill from typing import ( + IO, Any, Mapping, cast, @@ -17,10 +18,11 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import ( - Buffer, DtypeArg, - FilePathOrBuffer, + FilePath, + ReadBuffer, StorageOptions, + WriteExcelBuffer, ) from pandas.compat._optional import ( get_version, @@ -816,7 +818,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): # ExcelWriter. def __new__( cls, - path: FilePathOrBuffer | ExcelWriter, + path: FilePath | WriteExcelBuffer | ExcelWriter, engine=None, date_format=None, datetime_format=None, @@ -918,7 +920,7 @@ def save(self): def __init__( self, - path: FilePathOrBuffer | ExcelWriter, + path: FilePath | WriteExcelBuffer | ExcelWriter, engine=None, date_format=None, datetime_format=None, @@ -942,7 +944,7 @@ def __init__( # cast ExcelWriter to avoid adding 'if self.handles is not None' self.handles = IOHandles( - cast(Buffer[bytes], path), compression={"copression": None} + cast(IO[bytes], path), compression={"copression": None} ) if not isinstance(path, ExcelWriter): self.handles = get_handle( @@ -1061,7 +1063,7 @@ def close(self): @doc(storage_options=_shared_docs["storage_options"]) def inspect_excel_format( - content_or_path: FilePathOrBuffer, + content_or_path: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, ) -> str | None: """ @@ -1108,9 +1110,7 @@ def inspect_excel_format( elif not peek.startswith(ZIP_SIGNATURE): return None - # ZipFile typing is overly-strict - # https://github.com/python/typeshed/issues/4212 - zf = zipfile.ZipFile(stream) # type: ignore[arg-type] + zf = zipfile.ZipFile(stream) # Workaround for some third party files that use forward slashes and # lower case names. 
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index e0c5a2c6a7ff9..952ad72b480b7 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -3,7 +3,8 @@ import numpy as np from pandas._typing import ( - FilePathOrBuffer, + FilePath, + ReadBuffer, Scalar, StorageOptions, ) @@ -28,7 +29,7 @@ class ODFReader(BaseExcelReader): def __init__( self, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, ): import_optional_dependency("odf") @@ -40,7 +41,7 @@ def _workbook_class(self): return OpenDocument - def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): from odf.opendocument import load return load(filepath_or_buffer) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index d499f1a5ea89f..f34bf311e6ce7 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -9,7 +9,8 @@ import numpy as np from pandas._typing import ( - FilePathOrBuffer, + FilePath, + ReadBuffer, Scalar, StorageOptions, ) @@ -505,7 +506,7 @@ def write_cells( class OpenpyxlReader(BaseExcelReader): def __init__( self, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, ) -> None: """ @@ -527,7 +528,7 @@ def _workbook_class(self): return Workbook - def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): from openpyxl import load_workbook return load_workbook( diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 4b2b9f7a3a678..9284cf917a48c 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -2,7 +2,8 @@ from __future__ import annotations from pandas._typing import ( - FilePathOrBuffer, + FilePath, + ReadBuffer, Scalar, StorageOptions, ) @@ -14,7 +15,7 @@ class PyxlsbReader(BaseExcelReader): def __init__( self, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, ): """ @@ -38,7 +39,7 @@ def _workbook_class(self): return Workbook - def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): from pyxlsb import open_workbook # TODO: hack in buffer capability diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 145cbe182eadb..e4547b527a6b9 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,8 +7,10 @@ ) from pandas._typing import ( - FilePathOrBuffer, + FilePath, + ReadBuffer, StorageOptions, + WriteBuffer, ) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc @@ -26,7 +28,7 @@ @doc(storage_options=generic._shared_docs["storage_options"]) def to_feather( df: DataFrame, - path: FilePathOrBuffer[bytes], + path: FilePath | WriteBuffer[bytes], storage_options: StorageOptions = None, **kwargs, ): @@ -36,7 +38,7 @@ def to_feather( Parameters ---------- df : DataFrame - path : string file path, or file-like object + path : str, path object, or file-like object {storage_options} .. 
versionadded:: 1.2.0 @@ -93,7 +95,7 @@ def to_feather( @doc(storage_options=generic._shared_docs["storage_options"]) def read_feather( - path: FilePathOrBuffer[bytes], + path: FilePath | ReadBuffer[bytes], columns: Sequence[Hashable] | None = None, use_threads: bool = True, storage_options: StorageOptions = None, @@ -103,18 +105,11 @@ def read_feather( Parameters ---------- - path : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: - ``file://localhost/path/to/table.feather``. - - If you want to pass in a path object, pandas accepts any - ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. + path : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.feather``. columns : sequence, default None If not provided, all columns are read. use_threads : bool, default True diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index f078975e4b85a..18228a93b5285 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -20,10 +20,11 @@ from pandas._libs import writers as libwriters from pandas._typing import ( CompressionOptions, - FilePathOrBuffer, + FilePath, FloatFormatType, IndexLabel, StorageOptions, + WriteBuffer, ) from pandas.core.dtypes.generic import ( @@ -48,7 +49,7 @@ class CSVFormatter: def __init__( self, formatter: DataFrameFormatter, - path_or_buf: FilePathOrBuffer[str] | FilePathOrBuffer[bytes] = "", + path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "", sep: str = ",", cols: Sequence[Hashable] | None = None, index_label: IndexLabel | None = None, @@ -57,7 +58,7 @@ def __init__( errors: str = "strict", compression: CompressionOptions = "infer", quoting: int | None = None, - line_terminator="\n", + line_terminator: str | None = "\n", chunksize: int | None = None, quotechar: str | None = '"', date_format: str | None = None, @@ -245,7 +246,7 @@ def save(self) -> None: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( - handles.handle, # type: ignore[arg-type] + handles.handle, lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ca53bfb7d5e08..616331bf80a44 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -19,7 +19,6 @@ IO, TYPE_CHECKING, Any, - AnyStr, Callable, Hashable, Iterable, @@ -51,11 +50,12 @@ ColspaceArgType, ColspaceType, CompressionOptions, - FilePathOrBuffer, + FilePath, FloatFormatType, FormattersType, IndexLabel, StorageOptions, + WriteBuffer, ) from pandas.core.dtypes.common import ( @@ -1021,7 +1021,7 @@ def __init__(self, fmt: DataFrameFormatter): def to_latex( self, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, column_format: str | None = None, longtable: bool = False, encoding: str | None = None, @@ -1053,7 +1053,7 @@ def to_latex( def to_html( self, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, encoding: str | None = None, 
classes: str | list | tuple | None = None, notebook: bool = False, @@ -1066,8 +1066,10 @@ def to_html( Parameters ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. + buf : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If None, the result is + returned as a string. encoding : str, default “utf-8” Set character encoding. classes : str or list-like @@ -1102,7 +1104,7 @@ def to_html( def to_string( self, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, encoding: str | None = None, line_width: int | None = None, ) -> str | None: @@ -1111,8 +1113,10 @@ def to_string( Parameters ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. + buf : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If None, the result is + returned as a string. encoding: str, default “utf-8” Set character encoding. line_width : int, optional @@ -1126,7 +1130,7 @@ def to_string( def to_csv( self, - path_or_buf: FilePathOrBuffer[AnyStr] | None = None, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, encoding: str | None = None, sep: str = ",", columns: Sequence[Hashable] | None = None, @@ -1186,7 +1190,7 @@ def to_csv( def save_to_buffer( string: str, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, encoding: str | None = None, ) -> str | None: """ @@ -1200,7 +1204,7 @@ def save_to_buffer( @contextmanager -def get_buffer(buf: FilePathOrBuffer[str] | None, encoding: str | None = None): +def get_buffer(buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None): """ Context manager to open, yield and close buffer for filenames or Path-like objects, otherwise yield buf unchanged. @@ -2142,7 +2146,7 @@ def get_level_lengths( return result -def buffer_put_lines(buf: IO[str], lines: list[str]) -> None: +def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None: """ Appends lines to a buffer. diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 64a59778a54f3..ddd2420731028 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -6,7 +6,6 @@ ) import sys from typing import ( - IO, TYPE_CHECKING, Iterable, Iterator, @@ -16,7 +15,10 @@ from pandas._config import get_option -from pandas._typing import Dtype +from pandas._typing import ( + Dtype, + WriteBuffer, +) from pandas.core.indexes.api import Index @@ -171,7 +173,7 @@ def size_qualifier(self) -> str: def render( self, *, - buf: IO[str] | None, + buf: WriteBuffer[str] | None, max_cols: int | None, verbose: bool | None, show_counts: bool | None, @@ -287,7 +289,7 @@ def memory_usage_bytes(self) -> int: def render( self, *, - buf: IO[str] | None, + buf: WriteBuffer[str] | None, max_cols: int | None, verbose: bool | None, show_counts: bool | None, @@ -306,7 +308,7 @@ class InfoPrinterAbstract: Class for printing dataframe or series info. 
""" - def to_buffer(self, buf: IO[str] | None = None) -> None: + def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None: """Save dataframe info into buffer.""" table_builder = self._create_table_builder() lines = table_builder.get_lines() diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 40803ff14e357..b16e6e6366330 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -21,10 +21,11 @@ from pandas._typing import ( Axis, - FilePathOrBuffer, + FilePath, IndexLabel, Level, Scalar, + WriteBuffer, ) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc @@ -464,7 +465,7 @@ def to_excel( def to_latex( self, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, *, column_format: str | None = None, position: str | None = None, @@ -488,8 +489,10 @@ def to_latex( Parameters ---------- - buf : str, Path, or StringIO-like, optional, default None - Buffer to write to. If `None`, the output is returned as a string. + buf : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If None, the result is + returned as a string. column_format : str, optional The LaTeX column specification placed in location: @@ -893,7 +896,7 @@ def to_latex( def to_html( self, - buf: FilePathOrBuffer[str] | None = None, + buf: FilePath | WriteBuffer[str] | None = None, *, table_uuid: str | None = None, table_attributes: str | None = None, @@ -915,8 +918,10 @@ def to_html( Parameters ---------- - buf : str, Path, or StringIO-like, optional, default None - Buffer to write to. If ``None``, the output is returned as a string. + buf : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If None, the result is + returned as a string. table_uuid : str, optional Id attribute assigned to the HTML element in the format: diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index ea7d1dfa1645e..b997cd9bddd1e 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -9,8 +9,10 @@ from pandas._typing import ( CompressionOptions, - FilePathOrBuffer, + FilePath, + ReadBuffer, StorageOptions, + WriteBuffer, ) from pandas.errors import AbstractMethodError @@ -90,7 +92,7 @@ class BaseXMLFormatter: def __init__( self, frame: DataFrame, - path_or_buffer: FilePathOrBuffer | None = None, + path_or_buffer: FilePath | WriteBuffer[bytes] | None = None, index: bool | None = True, root_name: str | None = "data", row_name: str | None = "row", @@ -102,7 +104,7 @@ def __init__( encoding: str = "utf-8", xml_declaration: bool | None = True, pretty_print: bool | None = True, - stylesheet: FilePathOrBuffer | None = None, + stylesheet: FilePath | ReadBuffer[str] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> None: @@ -272,7 +274,7 @@ def write_output(self) -> str | None: storage_options=self.storage_options, is_text=False, ) as handles: - handles.handle.write(xml_doc) # type: ignore[arg-type] + handles.handle.write(xml_doc) return None else: @@ -582,7 +584,6 @@ def transform_doc(self) -> bytes: conditionally by its specific object type, then transforms original tree with XSLT script. 
""" - from lxml.etree import ( XSLT, XMLParser, @@ -591,6 +592,7 @@ def transform_doc(self) -> bytes: ) style_doc = self.stylesheet + assert style_doc is not None # is ensured by caller handle_data = get_data_from_filepath( filepath_or_buffer=style_doc, diff --git a/pandas/io/html.py b/pandas/io/html.py index cffe910f1c8ff..7985dcbec9672 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -14,7 +14,10 @@ Sequence, ) -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + FilePath, + ReadBuffer, +) from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, @@ -119,18 +122,21 @@ def _get_skiprows(skiprows: int | Sequence[int] | slice | None): raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") -def _read(obj: bytes | FilePathOrBuffer, encoding: str | None) -> str | bytes: +def _read( + obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None +) -> str | bytes: """ Try to read from a url, file or string. Parameters ---------- - obj : str, unicode, or file-like + obj : str, unicode, path object, or file-like object Returns ------- raw_text : str """ + text: str | bytes if ( is_url(obj) or hasattr(obj, "read") @@ -148,9 +154,7 @@ def _read(obj: bytes | FilePathOrBuffer, encoding: str | None) -> str | bytes: text = obj else: raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") - # error: Incompatible return value type (got "Union[Any, bytes, None, str]", - # expected "Union[str, bytes]") - return text # type: ignore[return-value] + return text class _HtmlFrameParser: @@ -211,7 +215,7 @@ class _HtmlFrameParser: def __init__( self, - io: FilePathOrBuffer, + io: FilePath | ReadBuffer[str] | ReadBuffer[bytes], match: str | Pattern, attrs: dict[str, str] | None, encoding: str, @@ -944,7 +948,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): @deprecate_nonkeyword_arguments(version="2.0") def read_html( - io: FilePathOrBuffer, + io: FilePath | ReadBuffer[str], match: str | Pattern = ".+", flavor: str | None = None, header: int | Sequence[int] | None = None, @@ -965,8 +969,10 @@ def read_html( Parameters ---------- - io : str, path object or file-like object - A URL, a file-like object, or a raw string containing HTML. Note that + io : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``read()`` function. + The string can represent a URL or the HTML itself. Note that lxml only accepts the http, ftp and file url protocols. If you have a URL that starts with ``'https'`` you might try removing the ``'s'``. diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 6bdb4df806b5c..6dd4de597c29d 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -3,7 +3,10 @@ from typing import TYPE_CHECKING -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + FilePath, + ReadBuffer, +) from pandas.compat._optional import import_optional_dependency from pandas.io.common import get_handle @@ -13,7 +16,7 @@ def read_orc( - path: FilePathOrBuffer, columns: list[str] | None = None, **kwargs + path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs ) -> DataFrame: """ Load an ORC object from the file path, returning a DataFrame. @@ -22,18 +25,12 @@ def read_orc( Parameters ---------- - path : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. 
Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is + path : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.orc``. - - If you want to pass in a path object, pandas accepts any - ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. columns : list, default None If not None, only these columns will be read from the file. **kwargs diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 2eb1dd2d44d65..c4b9e36472092 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -3,15 +3,14 @@ import io import os -from typing import ( - Any, - AnyStr, -) +from typing import Any from warnings import catch_warnings from pandas._typing import ( - FilePathOrBuffer, + FilePath, + ReadBuffer, StorageOptions, + WriteBuffer, ) from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError @@ -69,12 +68,14 @@ def get_engine(engine: str) -> BaseImpl: def _get_path_or_handle( - path: FilePathOrBuffer, + path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], fs: Any, storage_options: StorageOptions = None, mode: str = "rb", is_dir: bool = False, -) -> tuple[FilePathOrBuffer, IOHandles[bytes] | None, Any]: +) -> tuple[ + FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any +]: """File handling for PyArrow.""" path_or_handle = stringify_path(path) if is_fsspec_url(path_or_handle) and fs is None: @@ -157,7 +158,7 @@ def __init__(self): def write( self, df: DataFrame, - path: FilePathOrBuffer[AnyStr], + path: FilePath | WriteBuffer[bytes], compression: str | None = "snappy", index: bool | None = None, storage_options: StorageOptions = None, @@ -353,7 +354,7 @@ def read( @doc(storage_options=generic._shared_docs["storage_options"]) def to_parquet( df: DataFrame, - path: FilePathOrBuffer | None = None, + path: FilePath | WriteBuffer[bytes] | None = None, engine: str = "auto", compression: str | None = "snappy", index: bool | None = None, @@ -367,13 +368,12 @@ def to_parquet( Parameters ---------- df : DataFrame - path : str or file-like object, default None - If a string, it will be used as Root Directory path - when writing a partitioned dataset. By file-like object, - we refer to objects with a write() method, such as a file handle - (e.g. via builtin open function) or io.BytesIO. The engine - fastparquet does not accept file-like objects. If path is None, - a bytes object is returned. + path : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If None, the result is + returned as bytes. If a string, it will be used as Root Directory path + when writing a partitioned dataset. The engine fastparquet does not + accept file-like objects. .. 
versionchanged:: 1.2.0 @@ -415,7 +415,7 @@ def to_parquet( partition_cols = [partition_cols] impl = get_engine(engine) - path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path + path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path impl.write( df, @@ -449,21 +449,15 @@ def read_parquet( Parameters ---------- path : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. + The string could be a URL. Valid URL schemes include http, ftp, s3, + gs, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.parquet``. A file URL can also be a path to a directory that contains multiple partitioned parquet files. Both pyarrow and fastparquet support paths to directories as well as file URLs. A directory path could be: ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir`` - - If you want to pass in a path object, pandas accepts any - ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 5b1b178c4f610..9fbeeb74901ef 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -1,6 +1,9 @@ from __future__ import annotations -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + FilePath, + ReadBuffer, +) from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.inference import is_integer @@ -16,7 +19,7 @@ class ArrowParserWrapper(ParserBase): Wrapper for the pyarrow engine for read_csv() """ - def __init__(self, src: FilePathOrBuffer, **kwds): + def __init__(self, src: FilePath | ReadBuffer[bytes], **kwds): self.kwds = kwds self.src = src diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 42b9c8c9f10fe..d096e9008112b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -26,7 +26,8 @@ from pandas._typing import ( ArrayLike, DtypeArg, - FilePathOrBuffer, + FilePath, + ReadCsvBuffer, ) from pandas.errors import ( ParserError, @@ -218,7 +219,11 @@ def __init__(self, kwds): # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None: + def _open_handles( + self, + src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + kwds: dict[str, Any], + ) -> None: """ Let the readers open IOHandles after they are done with their potential raises. 
""" diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index db750cded45e5..e96df3b3f3782 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -7,7 +7,8 @@ import pandas._libs.parsers as parsers from pandas._typing import ( ArrayLike, - FilePathOrBuffer, + FilePath, + ReadCsvBuffer, ) from pandas.errors import DtypeWarning from pandas.util._exceptions import find_stack_level @@ -31,7 +32,9 @@ class CParserWrapper(ParserBase): low_memory: bool _reader: parsers.TextReader - def __init__(self, src: FilePathOrBuffer, **kwds): + def __init__( + self, src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds + ): self.kwds = kwds kwds = kwds.copy() ParserBase.__init__(self, kwds) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 36387f0835f4a..f5420618c0235 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -20,7 +20,8 @@ import pandas._libs.lib as lib from pandas._typing import ( - FilePathOrBuffer, + FilePath, + ReadCsvBuffer, Scalar, ) from pandas.errors import ( @@ -45,7 +46,9 @@ class PythonParser(ParserBase): - def __init__(self, f: FilePathOrBuffer | list, **kwds): + def __init__( + self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, **kwds + ): """ Workhorse function for processing nested list into DataFrame """ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 0b57f0f5ef814..9f555d77948a7 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -17,7 +17,8 @@ from pandas._typing import ( ArrayLike, DtypeArg, - FilePathOrBuffer, + FilePath, + ReadCsvBuffer, StorageOptions, ) from pandas.errors import ( @@ -505,7 +506,9 @@ def _validate_names(names): raise ValueError("Names should be an ordered collection.") -def _read(filepath_or_buffer: FilePathOrBuffer, kwds): +def _read( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds +): """Generic reader of line files.""" if kwds.get("date_parser", None) is not None: if isinstance(kwds["parse_dates"], bool): @@ -554,7 +557,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): ) ) def read_csv( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], sep=lib.no_default, delimiter=None, # Column and Index Locations and Names @@ -652,7 +655,7 @@ def read_csv( ) ) def read_table( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], sep=lib.no_default, delimiter=None, # Column and Index Locations and Names @@ -739,7 +742,7 @@ def read_table( def read_fwf( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], colspecs="infer", widths=None, infer_nrows=100, @@ -756,18 +759,12 @@ def read_fwf( Parameters ---------- - filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a text ``read()`` function.The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.csv``. 
- - If you want to pass in a path object, pandas accepts any - ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. colspecs : list of tuple (int, int) or 'infer'. optional A list of tuples giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). @@ -942,10 +939,10 @@ def _get_options_with_defaults(self, engine): def _check_file_or_buffer(self, f, engine): # see gh-16530 - if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"): - # The C engine doesn't need the file-like to have the "__next__" - # attribute. However, the Python engine explicitly calls - # "__next__(...)" when iterating through such an object, meaning it + if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"): + # The C engine doesn't need the file-like to have the "__iter__" + # attribute. However, the Python engine needs "__iter__(...)" + # when iterating through such an object, meaning it # needs to have that attribute raise ValueError( "The 'python' engine cannot iterate through this file buffer." diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 6a91c12ee286e..8bd0942550e6e 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,12 +1,16 @@ """ pickle compat """ +from __future__ import annotations + import pickle from typing import Any import warnings from pandas._typing import ( CompressionOptions, - FilePathOrBuffer, + FilePath, + ReadPickleBuffer, StorageOptions, + WriteBuffer, ) from pandas.compat import pickle_compat as pc from pandas.util._decorators import doc @@ -19,7 +23,7 @@ @doc(storage_options=generic._shared_docs["storage_options"]) def to_pickle( obj: Any, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | WriteBuffer[bytes], compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, @@ -31,8 +35,9 @@ def to_pickle( ---------- obj : any object Any python object. - filepath_or_buffer : str, path object or file-like object - File path, URL, or buffer where the pickled object will be stored. + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. .. versionchanged:: 1.0.0 Accept URL. URL has to be of S3 or GCS. @@ -103,26 +108,15 @@ def to_pickle( # pickle create the entire object and then write it to the buffer. 
# "zip" would also be here if pandas.io.common._BytesZipFile # wouldn't buffer write calls - handles.handle.write( - # error: Argument 1 to "write" of "TextIOBase" has incompatible type - # "bytes"; expected "str" - pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type] - ) + handles.handle.write(pickle.dumps(obj, protocol=protocol)) else: # letting pickle write directly to the buffer is more memory-efficient - pickle.dump( - # error: Argument 2 to "dump" has incompatible type "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]"; expected - # "IO[bytes]" - obj, - handles.handle, # type: ignore[arg-type] - protocol=protocol, - ) + pickle.dump(obj, handles.handle, protocol=protocol) @doc(storage_options=generic._shared_docs["storage_options"]) def read_pickle( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadPickleBuffer, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): @@ -136,8 +130,9 @@ def read_pickle( Parameters ---------- - filepath_or_buffer : str, path object or file-like object - File path, URL, or buffer where the pickled object will be loaded from. + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``readlines()`` function. .. versionchanged:: 1.0.0 Accept URL. URL is not limited to S3 and GCS. @@ -211,10 +206,7 @@ def read_pickle( with warnings.catch_warnings(record=True): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) - # error: Argument 1 to "load" has incompatible type "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]"; - # expected "IO[bytes]" - return pickle.load(handles.handle) # type: ignore[arg-type] + return pickle.load(handles.handle) except excs_to_catch: # e.g. 
# "No module named 'pandas.core.sparse.series'" diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 300df9728cd75..cd863cabf5c2d 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -21,14 +21,14 @@ timedelta, ) import struct -from typing import ( - IO, - Any, - cast, -) +from typing import cast import numpy as np +from pandas._typing import ( + FilePath, + ReadBuffer, +) from pandas.errors import ( EmptyDataError, OutOfBoundsDatetime, @@ -159,7 +159,7 @@ class SAS7BDATReader(ReaderBase, abc.Iterator): def __init__( self, - path_or_buf, + path_or_buf: FilePath | ReadBuffer[bytes], index=None, convert_dates=True, blank_missing=True, @@ -179,16 +179,16 @@ def __init__( self.default_encoding = "latin-1" self.compression = b"" - self.column_names_strings = [] - self.column_names = [] - self.column_formats = [] - self.columns = [] + self.column_names_strings: list[str] = [] + self.column_names: list[str] = [] + self.column_formats: list[str] = [] + self.columns: list[_Column] = [] - self._current_page_data_subheader_pointers = [] + self._current_page_data_subheader_pointers: list[_SubheaderPointer] = [] self._cached_page = None - self._column_data_lengths = [] - self._column_data_offsets = [] - self._column_types = [] + self._column_data_lengths: list[int] = [] + self._column_data_offsets: list[int] = [] + self._column_types: list[bytes] = [] self._current_row_in_file_index = 0 self._current_row_on_page_index = 0 @@ -196,7 +196,7 @@ def __init__( self.handles = get_handle(path_or_buf, "rb", is_text=False) - self._path_or_buf = cast(IO[Any], self.handles.handle) + self._path_or_buf = self.handles.handle try: self._get_properties() @@ -227,7 +227,7 @@ def _get_properties(self) -> None: # Check magic number self._path_or_buf.seek(0) - self._cached_page = cast(bytes, self._path_or_buf.read(288)) + self._cached_page = self._path_or_buf.read(288) if self._cached_page[0 : len(const.magic)] != const.magic: raise ValueError("magic number mismatch (not a SAS file?)") @@ -301,7 +301,7 @@ def _get_properties(self) -> None: ) # Read the rest of the header into cached_page. 
- buf = cast(bytes, self._path_or_buf.read(self.header_length - 288)) + buf = self._path_or_buf.read(self.header_length - 288) self._cached_page += buf # error: Argument 1 to "len" has incompatible type "Optional[bytes]"; # expected "Sized" @@ -400,7 +400,7 @@ def _read_bytes(self, offset: int, length: int): def _parse_metadata(self) -> None: done = False while not done: - self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length)) + self._cached_page = self._path_or_buf.read(self._page_length) if len(self._cached_page) <= 0: break if len(self._cached_page) != self._page_length: @@ -761,7 +761,7 @@ def read(self, nrows: int | None = None) -> DataFrame | None: def _read_next_page(self): self._current_page_data_subheader_pointers = [] - self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length)) + self._cached_page = self._path_or_buf.read(self._page_length) if len(self._cached_page) <= 0: return True elif len(self._cached_page) != self._page_length: @@ -817,7 +817,7 @@ def _chunk_to_dataframe(self) -> DataFrame: js += 1 else: self.close() - raise ValueError(f"unknown column type {self._column_types[j]}") + raise ValueError(f"unknown column type {repr(self._column_types[j])}") df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False) return df diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 3f9bf6662e99f..d8a3412e05d05 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -7,17 +7,19 @@ https://support.sas.com/techsup/technote/ts140.pdf """ +from __future__ import annotations + from collections import abc from datetime import datetime import struct -from typing import ( - IO, - cast, -) import warnings import numpy as np +from pandas._typing import ( + FilePath, + ReadBuffer, +) from pandas.util._decorators import Appender import pandas as pd @@ -248,7 +250,11 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + index=None, + encoding: str | None = "ISO-8859-1", + chunksize=None, ): self._encoding = encoding @@ -259,7 +265,7 @@ def __init__( self.handles = get_handle( filepath_or_buffer, "rb", encoding=encoding, is_text=False ) - self.filepath_or_buffer = cast(IO[bytes], self.handles.handle) + self.filepath_or_buffer = self.handles.handle try: self._read_header() diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index b323ce39763a1..f50fc777f55e9 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -13,7 +13,10 @@ overload, ) -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + FilePath, + ReadBuffer, +) from pandas.io.common import stringify_path @@ -44,7 +47,7 @@ def __exit__(self, exc_type, exc_value, traceback): @overload def read_sas( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadBuffer[bytes], format: str | None = ..., index: Hashable | None = ..., encoding: str | None = ..., @@ -56,7 +59,7 @@ def read_sas( @overload def read_sas( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadBuffer[bytes], format: str | None = ..., index: Hashable | None = ..., encoding: str | None = ..., @@ -67,7 +70,7 @@ def read_sas( def read_sas( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadBuffer[bytes], format: str | None = None, index: Hashable | None = None, encoding: str | None = None, @@ 
-79,18 +82,12 @@ def read_sas( Parameters ---------- - filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.sas``. - - If you want to pass in a path object, pandas accepts any - ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. format : str {'xport', 'sas7bdat'} or None If None, file format is inferred from file extension. If 'xport' or 'sas7bdat', uses the corresponding format. diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 013f17580600d..ff9d8a1be3d1e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -18,6 +18,7 @@ import struct import sys from typing import ( + IO, TYPE_CHECKING, Any, AnyStr, @@ -33,10 +34,11 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array from pandas._typing import ( - Buffer, CompressionOptions, - FilePathOrBuffer, + FilePath, + ReadBuffer, StorageOptions, + WriteBuffer, ) from pandas.util._decorators import ( Appender, @@ -1117,7 +1119,7 @@ class StataReader(StataParser, abc.Iterator): def __init__( self, - path_or_buf: FilePathOrBuffer, + path_or_buf: FilePath | ReadBuffer[bytes], convert_dates: bool = True, convert_categoricals: bool = True, index_col: str | None = None, @@ -1168,10 +1170,7 @@ def __init__( compression=compression, ) as handles: # Copy to BytesIO, and ensure no encoding - - # Argument 1 to "BytesIO" has incompatible type "Union[Any, bytes, None, - # str]"; expected "bytes" - self.path_or_buf = BytesIO(handles.handle.read()) # type: ignore[arg-type] + self.path_or_buf = BytesIO(handles.handle.read()) self._read_header() self._setup_dtype() @@ -2002,7 +2001,7 @@ def value_labels(self) -> dict[str, dict[float | int, str]]: @Appender(_read_stata_doc) def read_stata( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer: FilePath | ReadBuffer[bytes], convert_dates: bool = True, convert_categoricals: bool = True, index_col: str | None = None, @@ -2270,7 +2269,7 @@ class StataWriter(StataParser): def __init__( self, - fname: FilePathOrBuffer, + fname: FilePath | WriteBuffer[bytes], data: DataFrame, convert_dates: dict[Hashable, str] | None = None, write_index: bool = True, @@ -2294,7 +2293,7 @@ def __init__( self._value_labels: list[StataValueLabel] = [] self._has_value_labels = np.array([], dtype=bool) self._compression = compression - self._output_file: Buffer[bytes] | None = None + self._output_file: IO[bytes] | None = None self._converted_names: dict[Hashable, str] = {} # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -2310,15 +2309,13 @@ def _write(self, to_write: str) -> None: """ Helper to call encode before writing to file for Python 3 compat. """ - self.handles.handle.write( - to_write.encode(self._encoding) # type: ignore[arg-type] - ) + self.handles.handle.write(to_write.encode(self._encoding)) def _write_bytes(self, value: bytes) -> None: """ Helper to assert file is open before writing. 
""" - self.handles.handle.write(value) # type: ignore[arg-type] + self.handles.handle.write(value) def _prepare_non_cat_value_labels( self, data: DataFrame @@ -2686,7 +2683,7 @@ def _close(self) -> None: if self._output_file is not None: assert isinstance(self.handles.handle, BytesIO) bio, self.handles.handle = self.handles.handle, self._output_file - self.handles.handle.write(bio.getvalue()) # type: ignore[arg-type] + self.handles.handle.write(bio.getvalue()) def _write_map(self) -> None: """No-op, future compatibility""" @@ -3203,7 +3200,7 @@ class StataWriter117(StataWriter): def __init__( self, - fname: FilePathOrBuffer, + fname: FilePath | WriteBuffer[bytes], data: DataFrame, convert_dates: dict[Hashable, str] | None = None, write_index: bool = True, @@ -3605,7 +3602,7 @@ class StataWriterUTF8(StataWriter117): def __init__( self, - fname: FilePathOrBuffer, + fname: FilePath | WriteBuffer[bytes], data: DataFrame, convert_dates: dict[Hashable, str] | None = None, write_index: bool = True, diff --git a/pandas/io/xml.py b/pandas/io/xml.py index bc3436861f1a8..3c3b4afa2c57d 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -7,9 +7,9 @@ import io from pandas._typing import ( - Buffer, CompressionOptions, - FilePathOrBuffer, + FilePath, + ReadBuffer, StorageOptions, ) from pandas.compat._optional import import_optional_dependency @@ -199,9 +199,6 @@ class _EtreeFrameParser(_XMLFrameParser): standard library XML module: `xml.etree.ElementTree`. """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - def parse_data(self) -> list[dict[str, str | None]]: from xml.etree.ElementTree import XML @@ -571,11 +568,11 @@ def _transform_doc(self) -> bytes: def get_data_from_filepath( - filepath_or_buffer, + filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], encoding, compression, storage_options, -) -> str | bytes | Buffer: +) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: """ Extract raw XML data. @@ -587,7 +584,8 @@ def get_data_from_filepath( This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. 
""" - filepath_or_buffer = stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, bytes): + filepath_or_buffer = stringify_path(filepath_or_buffer) if ( isinstance(filepath_or_buffer, str) @@ -606,7 +604,10 @@ def get_data_from_filepath( storage_options=storage_options, ) as handle_obj: filepath_or_buffer = ( - handle_obj.handle.read() + # error: Incompatible types in assignment (expression has type + # "Union[str, IO[str]]", variable has type "Union[Union[str, + # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]") + handle_obj.handle.read() # type: ignore[assignment] if hasattr(handle_obj.handle, "read") else handle_obj.handle ) @@ -728,7 +729,7 @@ def _parse( @doc(storage_options=_shared_docs["storage_options"]) def read_xml( - path_or_buffer: FilePathOrBuffer, + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str | None = "./*", namespaces: dict | list[dict] | None = None, elems_only: bool | None = False, @@ -736,7 +737,7 @@ def read_xml( names: list[str] | None = None, encoding: str | None = "utf-8", parser: str | None = "lxml", - stylesheet: FilePathOrBuffer | None = None, + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> DataFrame: @@ -748,8 +749,10 @@ def read_xml( Parameters ---------- path_or_buffer : str, path object, or file-like object - Any valid XML string or path is acceptable. The string could be a URL. - Valid URL schemes include http, ftp, s3, and file. + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a ``read()`` function. The string can be any valid XML + string or a path. The string can further be a URL. Valid URL schemes + include http, ftp, s3, and file. xpath : str, optional, default './\*' The XPath to parse required set of nodes for migration to DataFrame. 
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index f62c9fd1349bf..df8be721ec871 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -310,3 +310,22 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) + + +def test_python_engine_file_no_next(python_parser_only): + parser = python_parser_only + + class NoNextBuffer: + def __init__(self, csv_data): + self.data = csv_data + + def __iter__(self): + return self.data.__iter__() + + def read(self): + return self.data + + def readline(self): + return self.data + + parser.read_csv(NoNextBuffer("a\n1")) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 1e5cf49ce24ae..89d35499fd597 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -104,22 +104,25 @@ def test_python_engine(self, python_engine): with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine=python_engine, **kwargs) - def test_python_engine_file_no_next(self, python_engine): + def test_python_engine_file_no_iter(self, python_engine): # see gh-16530 class NoNextBuffer: def __init__(self, csv_data): self.data = csv_data - def __iter__(self): - return self + def __next__(self): + return self.data.__next__() def read(self): return self.data + def readline(self): + return self.data + data = "a\n1" - msg = "The 'python' engine cannot iterate" + msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator" - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) def test_pyarrow_engine(self): diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index c257b61db296e..b8d146c597d2c 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -1287,8 +1287,7 @@ def test_compression_output(parser, comp): output = equalize_decl(output) - # error: Item "None" of "Union[str, bytes, None]" has no attribute "strip" - assert geom_xml == output.strip() # type: ignore[union-attr] + assert geom_xml == output.strip() @pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"]) @@ -1306,8 +1305,7 @@ def test_filename_and_suffix_comp(parser, comp, compfile): output = equalize_decl(output) - # error: Item "None" of "Union[str, bytes, None]" has no attribute "strip" - assert geom_xml == output.strip() # type: ignore[union-attr] + assert geom_xml == output.strip() def test_unsuported_compression(datapath, parser):
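Taken together, the test changes spell out the python engine's new runtime contract: a duck-typed buffer only needs read(), readline() and __iter__(), and a buffer that is not iterable now surfaces as a TypeError from the underlying iteration machinery rather than the old pandas-raised ValueError. A minimal sketch of such a buffer, modeled on the NoNextBuffer helper added above; the MinimalBuffer name is made up for illustration:

    import pandas as pd

    class MinimalBuffer:
        # just enough of the file API for engine="python": no __next__,
        # no mode, no seek -- only read(), readline() and __iter__()
        def __init__(self, data: str) -> None:
            self.data = data

        def __iter__(self):
            return iter(self.data.splitlines(keepends=True))

        def read(self):
            return self.data

        def readline(self):
            return self.data

    df = pd.read_csv(MinimalBuffer("a,b\n1,2\n"), engine="python")
    print(df)  # a single row with columns 'a' and 'b'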