diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
index c7113e663789b..437e75be0e55b 100644
--- a/pandas/_testing/_io.py
+++ b/pandas/_testing/_io.py
@@ -10,7 +10,10 @@
)
import zipfile
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import (
+ FilePath,
+ ReadPickleBuffer,
+)
from pandas.compat import (
get_lzma_file,
import_lzma,
@@ -277,7 +280,7 @@ def can_connect(url, error_classes=None):
def round_trip_pickle(
- obj: Any, path: FilePathOrBuffer | None = None
+ obj: Any, path: FilePath | ReadPickleBuffer | None = None
) -> DataFrame | Series:
"""
Pickle an object and then read it again.
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 85e29681285f4..89e1c0bf7a71f 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -1,28 +1,24 @@
+from __future__ import annotations
+
from datetime import (
datetime,
timedelta,
tzinfo,
)
-from io import (
- BufferedIOBase,
- RawIOBase,
- TextIOBase,
-)
-from mmap import mmap
from os import PathLike
from typing import (
- IO,
TYPE_CHECKING,
Any,
- AnyStr,
Callable,
Collection,
Dict,
Hashable,
+ Iterator,
List,
Literal,
Mapping,
Optional,
+ Protocol,
Sequence,
Tuple,
Type as type_t,
@@ -169,9 +165,76 @@
PythonFuncType = Callable[[Any], Any]
# filenames and file-like-objects
-Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, mmap]
-FileOrBuffer = Union[str, Buffer[AnyStr]]
-FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[AnyStr]]
+AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
+AnyStr_con = TypeVar("AnyStr_con", str, bytes, contravariant=True)
+
+
+class BaseBuffer(Protocol):
+ @property
+ def mode(self) -> str:
+ # for _get_filepath_or_buffer
+ ...
+
+ def fileno(self) -> int:
+ # for _MMapWrapper
+ ...
+
+ def seek(self, __offset: int, __whence: int = ...) -> int:
+ # with one argument: gzip.GzipFile, bz2.BZ2File
+ # with two arguments: zipfile.ZipFile, read_sas
+ ...
+
+ def seekable(self) -> bool:
+ # for bz2.BZ2File
+ ...
+
+ def tell(self) -> int:
+ # for zipfile.ZipFile, read_stata, to_stata
+ ...
+
+
+class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]):
+ def read(self, __n: int | None = ...) -> AnyStr_cov:
+ # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File
+ ...
+
+
+class WriteBuffer(BaseBuffer, Protocol[AnyStr_con]):
+ def write(self, __b: AnyStr_con) -> Any:
+ # for gzip.GzipFile, bz2.BZ2File
+ ...
+
+ def flush(self) -> Any:
+ # for gzip.GzipFile, bz2.BZ2File
+ ...
+
+
+class ReadPickleBuffer(ReadBuffer[bytes], Protocol):
+ def readline(self) -> bytes:
+ ...
+
+
+class WriteExcelBuffer(WriteBuffer[bytes], Protocol):
+ def truncate(self, size: int | None = ...) -> int:
+ ...
+
+
+class ReadCsvBuffer(ReadBuffer[AnyStr_cov], Protocol):
+ def __iter__(self) -> Iterator[AnyStr_cov]:
+ # for engine=python
+ ...
+
+ def readline(self) -> AnyStr_cov:
+ # for engine=python
+ ...
+
+ @property
+ def closed(self) -> bool:
+ # for engine=pyarrow
+ ...
+
+
+FilePath = Union[str, "PathLike[str]"]
# for arbitrary kwargs passed during reading/writing files
StorageOptions = Optional[Dict[str, Any]]
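
Not part of the patch: a minimal sketch of how the Protocol-based buffer types above are meant to be satisfied. `pandas._typing` is a private module and may change; `ListWriter` is a hypothetical class used only to show that structural subtyping (no inheritance from the Protocols) is enough.

```python
# Hypothetical illustration (not from the patch): an object satisfies
# WriteBuffer[str] purely structurally -- no inheritance from the Protocol.
from __future__ import annotations

from pandas._typing import WriteBuffer


class ListWriter:
    """Collects written text chunks; exposes the BaseBuffer surface."""

    def __init__(self) -> None:
        self.chunks: list[str] = []

    @property
    def mode(self) -> str:
        return "w"

    def fileno(self) -> int:
        raise OSError("ListWriter has no file descriptor")

    def seek(self, offset: int, whence: int = 0) -> int:
        return 0

    def seekable(self) -> bool:
        return False

    def tell(self) -> int:
        return 0

    def write(self, b: str) -> int:
        self.chunks.append(b)
        return len(b)

    def flush(self) -> None:
        return None


def render_csv(buf: WriteBuffer[str]) -> None:
    # mypy accepts ListWriter here via structural subtyping
    buf.write("a,b\n1,2\n")


render_csv(ListWriter())
```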
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b88c97b8e988d..0d1b8e995f18c 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -16,7 +16,6 @@
import functools
from io import StringIO
import itertools
-import mmap
from textwrap import dedent
from typing import (
IO,
@@ -55,7 +54,7 @@
CompressionOptions,
Dtype,
DtypeObj,
- FilePathOrBuffer,
+ FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
@@ -71,6 +70,7 @@
TimedeltaConvertibleTypes,
TimestampConvertibleTypes,
ValueKeyFunc,
+ WriteBuffer,
npt,
)
from pandas.compat._optional import import_optional_dependency
@@ -1056,7 +1056,7 @@ def _repr_html_(self) -> str | None:
@Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
def to_string(
self,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
columns: Sequence[str] | None = None,
col_space: int | None = None,
header: bool | Sequence[str] = True,
@@ -2432,7 +2432,7 @@ def _from_arrays(
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
def to_stata(
self,
- path: FilePathOrBuffer,
+ path: FilePath | WriteBuffer[bytes],
convert_dates: dict[Hashable, str] | None = None,
write_index: bool = True,
byteorder: str | None = None,
@@ -2454,11 +2454,9 @@ def to_stata(
Parameters
----------
- path : str, buffer or path object
- String, path object (pathlib.Path or py._path.local.LocalPath) or
- object implementing a binary write() function. If using a buffer
- then the buffer will not be automatically closed after the file
- data has been written.
+ path : str, path object, or buffer
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``write()`` function.
.. versionchanged:: 1.0.0
@@ -2600,14 +2598,16 @@ def to_stata(
writer.write_file()
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
- def to_feather(self, path: FilePathOrBuffer[bytes], **kwargs) -> None:
+ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
"""
Write a DataFrame to the binary Feather format.
Parameters
----------
- path : str or file-like object
- If a string, it will be used as Root Directory path.
+ path : str, path object, file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``write()`` function. If a string or a path,
+ it will be used as Root Directory path when writing a partitioned dataset.
**kwargs :
Additional keywords passed to :func:`pyarrow.feather.write_feather`.
Starting with pyarrow 0.17, this includes the `compression`,
@@ -2677,15 +2677,14 @@ def to_markdown(
return result
with get_handle(buf, mode, storage_options=storage_options) as handles:
- assert not isinstance(handles.handle, (str, mmap.mmap))
- handles.handle.writelines(result)
+ handles.handle.write(result)
return None
@doc(storage_options=generic._shared_docs["storage_options"])
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
def to_parquet(
self,
- path: FilePathOrBuffer | None = None,
+ path: FilePath | WriteBuffer[bytes] | None = None,
engine: str = "auto",
compression: str | None = "snappy",
index: bool | None = None,
@@ -2703,13 +2702,11 @@ def to_parquet(
Parameters
----------
- path : str or file-like object, default None
- If a string, it will be used as Root Directory path
- when writing a partitioned dataset. By file-like object,
- we refer to objects with a write() method, such as a file handle
- (e.g. via builtin open function) or io.BytesIO. The engine
- fastparquet does not accept file-like objects. If path is None,
- a bytes object is returned.
+ path : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``write()`` function. If None, the result is
+ returned as bytes. If a string or path, it will be used as Root Directory
+ path when writing a partitioned dataset.
.. versionchanged:: 1.2.0
@@ -2804,7 +2801,7 @@ def to_parquet(
@Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
def to_html(
self,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
columns: Sequence[str] | None = None,
col_space: ColspaceArgType | None = None,
header: bool | Sequence[str] = True,
@@ -2891,7 +2888,7 @@ def to_html(
@doc(storage_options=generic._shared_docs["storage_options"])
def to_xml(
self,
- path_or_buffer: FilePathOrBuffer | None = None,
+ path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
index: bool = True,
root_name: str | None = "data",
row_name: str | None = "row",
@@ -2904,7 +2901,7 @@ def to_xml(
xml_declaration: bool | None = True,
pretty_print: bool | None = True,
parser: str | None = "lxml",
- stylesheet: FilePathOrBuffer | None = None,
+ stylesheet: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
) -> str | None:
@@ -2915,9 +2912,10 @@ def to_xml(
Parameters
----------
- path_or_buffer : str, path object or file-like object, optional
- File to write output to. If None, the output is returned as a
- string.
+ path_or_buffer : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a ``write()`` function. If None, the result is returned
+ as a string.
index : bool, default True
Whether to include index in XML document.
root_name : str, default 'data'
@@ -3211,7 +3209,7 @@ def to_xml(
def info(
self,
verbose: bool | None = None,
- buf: IO[str] | None = None,
+ buf: WriteBuffer[str] | None = None,
max_cols: int | None = None,
memory_usage: bool | str | None = None,
show_counts: bool | None = None,
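
A short usage sketch of the `FilePath | WriteBuffer[str]` pattern now taken by `to_string`, `to_html`, and `info` above; nothing here is part of the diff, and the file name is illustrative.

```python
# Illustrative only: both a text buffer and a plain path are accepted.
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

buf = io.StringIO()
df.to_string(buf=buf)            # file-like object with a string write()
print(buf.getvalue())

df.to_string(buf="frame.txt")    # str path (FilePath); file name is made up
```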
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index fd8af2c0cedd0..0945193673107 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -12,7 +12,6 @@
from typing import (
TYPE_CHECKING,
Any,
- AnyStr,
Callable,
Hashable,
Literal,
@@ -44,7 +43,7 @@
Dtype,
DtypeArg,
DtypeObj,
- FilePathOrBuffer,
+ FilePath,
IndexKeyFunc,
IndexLabel,
JSONSerializable,
@@ -58,6 +57,7 @@
TimedeltaConvertibleTypes,
TimestampConvertibleTypes,
ValueKeyFunc,
+ WriteBuffer,
npt,
)
from pandas.compat._optional import import_optional_dependency
@@ -2332,7 +2332,7 @@ def to_excel(
@doc(storage_options=_shared_docs["storage_options"])
def to_json(
self,
- path_or_buf: FilePathOrBuffer | None = None,
+ path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
orient: str | None = None,
date_format: str | None = None,
double_precision: int = 10,
@@ -2353,9 +2353,10 @@ def to_json(
Parameters
----------
- path_or_buf : str or file handle, optional
- File path or object. If not specified, the result is returned as
- a string.
+ path_or_buf : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a ``write()`` function. If None, the result is
+ returned as a string.
orient : str
Indication of expected JSON string format.
@@ -3337,7 +3338,7 @@ def to_latex(
@doc(storage_options=_shared_docs["storage_options"])
def to_csv(
self,
- path_or_buf: FilePathOrBuffer[AnyStr] | None = None,
+ path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | None = None,
@@ -3364,10 +3365,11 @@ def to_csv(
Parameters
----------
- path_or_buf : str or file handle, default None
- File path or object, if None is provided the result is returned as
- a string. If a non-binary file object is passed, it should be opened
- with `newline=''`, disabling universal newlines. If a binary
+ path_or_buf : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a ``write()`` function. If None, the result is
+ returned as a string. If a non-binary file object is passed, it should
+ be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
.. versionchanged:: 1.2.0
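
A small sketch of the `path_or_buf` behaviours described in the rewritten docstrings above; illustrative, not part of the change.

```python
# Illustrative only.
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

text = df.to_csv(path_or_buf=None)       # None -> the CSV is returned as str
assert isinstance(text, str)

with io.StringIO(newline="") as sio:     # text buffer opened with newline=""
    df.to_csv(sio)
    assert sio.getvalue().startswith(",a")
```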
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 1e928d1f2cd9e..fa94319c75fa9 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -26,6 +26,7 @@
Generic,
Literal,
Mapping,
+ TypeVar,
cast,
overload,
)
@@ -40,12 +41,13 @@
import zipfile
from pandas._typing import (
- Buffer,
+ BaseBuffer,
CompressionDict,
CompressionOptions,
- FileOrBuffer,
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
StorageOptions,
+ WriteBuffer,
)
from pandas.compat import (
get_lzma_file,
@@ -61,19 +63,16 @@
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
+BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)
+
@dataclasses.dataclass
class IOArgs:
"""
Return value of io/common.py:_get_filepath_or_buffer.
-
- Note (copy&past from io/parsers):
- filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
- though mypy handling of conditional imports is difficult.
- See https://github.com/python/mypy/issues/1297
"""
- filepath_or_buffer: FileOrBuffer
+ filepath_or_buffer: str | BaseBuffer
encoding: str
mode: str
compression: CompressionDict
@@ -95,9 +94,10 @@ class IOHandles(Generic[AnyStr]):
is_wrapped: Whether a TextIOWrapper needs to be detached.
"""
- handle: Buffer[AnyStr]
+ # handle might not implement the IO-interface
+ handle: IO[AnyStr]
compression: CompressionDict
- created_handles: list[Buffer] = dataclasses.field(default_factory=list)
+ created_handles: list[IO[bytes] | IO[str]] = dataclasses.field(default_factory=list)
is_wrapped: bool = False
is_mmap: bool = False
@@ -128,7 +128,7 @@ def __exit__(self, *args: Any) -> None:
self.close()
-def is_url(url) -> bool:
+def is_url(url: object) -> bool:
"""
Check to see if a URL has a valid protocol.
@@ -146,7 +146,17 @@ def is_url(url) -> bool:
return parse_url(url).scheme in _VALID_URLS
-def _expand_user(filepath_or_buffer: FileOrBuffer[AnyStr]) -> FileOrBuffer[AnyStr]:
+@overload
+def _expand_user(filepath_or_buffer: str) -> str:
+ ...
+
+
+@overload
+def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT:
+ ...
+
+
+def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:
"""
Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.
@@ -174,10 +184,22 @@ def validate_header_arg(header) -> None:
)
+@overload
+def stringify_path(filepath_or_buffer: FilePath, convert_file_like: bool = ...) -> str:
+ ...
+
+
+@overload
def stringify_path(
- filepath_or_buffer: FilePathOrBuffer[AnyStr],
+ filepath_or_buffer: BaseBufferT, convert_file_like: bool = ...
+) -> BaseBufferT:
+ ...
+
+
+def stringify_path(
+ filepath_or_buffer: FilePath | BaseBufferT,
convert_file_like: bool = False,
-) -> FileOrBuffer[AnyStr]:
+) -> str | BaseBufferT:
"""
Attempt to convert a path-like object to a string.
@@ -201,7 +223,7 @@ def stringify_path(
# GH 38125: some fsspec objects implement os.PathLike but have already opened a
# file. This prevents opening the file a second time. infer_compression calls
# this function with convert_file_like=True to infer the compression.
- return cast(FileOrBuffer[AnyStr], filepath_or_buffer)
+ return cast(BaseBufferT, filepath_or_buffer)
if isinstance(filepath_or_buffer, os.PathLike):
filepath_or_buffer = filepath_or_buffer.__fspath__()
@@ -218,7 +240,7 @@ def urlopen(*args, **kwargs):
return urllib.request.urlopen(*args, **kwargs)
-def is_fsspec_url(url: FilePathOrBuffer) -> bool:
+def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
"""
Returns true if the given URL looks like
something fsspec can handle
@@ -231,7 +253,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
def _get_filepath_or_buffer(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | BaseBuffer,
encoding: str = "utf-8",
compression: CompressionOptions = None,
mode: str = "r",
@@ -393,7 +415,11 @@ def _get_filepath_or_buffer(
mode=mode,
)
- if not is_file_like(filepath_or_buffer):
+ # is_file_like requires (read | write) & __iter__ but __iter__ is only
+ # needed for read_csv(engine=python)
+ if not (
+ hasattr(filepath_or_buffer, "read") or hasattr(filepath_or_buffer, "write")
+ ):
msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
raise ValueError(msg)
@@ -463,7 +489,7 @@ def get_compression_method(
def infer_compression(
- filepath_or_buffer: FilePathOrBuffer, compression: str | None
+ filepath_or_buffer: FilePath | BaseBuffer, compression: str | None
) -> str | None:
"""
Get the compression method for filepath_or_buffer. If compression='infer',
@@ -538,7 +564,7 @@ def check_parent_directory(path: Path | str) -> None:
@overload
def get_handle(
- path_or_buf: FilePathOrBuffer,
+ path_or_buf: FilePath | BaseBuffer,
mode: str,
*,
encoding: str | None = ...,
@@ -553,7 +579,7 @@ def get_handle(
@overload
def get_handle(
- path_or_buf: FilePathOrBuffer,
+ path_or_buf: FilePath | BaseBuffer,
mode: str,
*,
encoding: str | None = ...,
@@ -567,7 +593,7 @@ def get_handle(
def get_handle(
- path_or_buf: FilePathOrBuffer,
+ path_or_buf: FilePath | BaseBuffer,
mode: str,
*,
encoding: str | None = None,
@@ -649,7 +675,7 @@ def get_handle(
)
handle = ioargs.filepath_or_buffer
- handles: list[Buffer]
+ handles: list[BaseBuffer]
# memory mapping needs to be the first step
handle, memory_map, handles = _maybe_memory_map(
@@ -677,17 +703,18 @@ def get_handle(
if compression == "gzip":
if is_path:
assert isinstance(handle, str)
- handle = gzip.GzipFile(
+ # error: Incompatible types in assignment (expression has type
+ # "GzipFile", variable has type "Union[str, BaseBuffer]")
+ handle = gzip.GzipFile( # type: ignore[assignment]
filename=handle,
mode=ioargs.mode,
**compression_args,
)
else:
handle = gzip.GzipFile(
- # error: Argument "fileobj" to "GzipFile" has incompatible type
- # "Union[str, Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
- # TextIOWrapper, mmap]]"; expected "Optional[IO[bytes]]"
- fileobj=handle, # type: ignore[arg-type]
+ # No overload variant of "GzipFile" matches argument types
+ # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
+ fileobj=handle, # type: ignore[call-overload]
mode=ioargs.mode,
**compression_args,
)
@@ -706,7 +733,12 @@ def get_handle(
# ZIP Compression
elif compression == "zip":
- handle = _BytesZipFile(handle, ioargs.mode, **compression_args)
+ # error: Argument 1 to "_BytesZipFile" has incompatible type "Union[str,
+ # BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
+ # ReadBuffer[bytes], WriteBuffer[bytes]]"
+ handle = _BytesZipFile(
+ handle, ioargs.mode, **compression_args # type: ignore[arg-type]
+ )
if handle.mode == "r":
handles.append(handle)
zip_names = handle.namelist()
@@ -787,10 +819,14 @@ def get_handle(
assert not isinstance(ioargs.filepath_or_buffer, str)
handles.append(ioargs.filepath_or_buffer)
- assert not isinstance(handle, str)
return IOHandles(
- handle=handle,
- created_handles=handles,
+ # error: Argument "handle" to "IOHandles" has incompatible type
+ # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
+ # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
+ handle=handle, # type: ignore[arg-type]
+ # error: Argument "created_handles" to "IOHandles" has incompatible type
+ # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
+ created_handles=handles, # type: ignore[arg-type]
is_wrapped=is_wrapped,
is_mmap=memory_map,
compression=ioargs.compression,
@@ -821,7 +857,7 @@ class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc]
# GH 17778
def __init__(
self,
- file: FilePathOrBuffer,
+ file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
mode: str,
archive_name: str | None = None,
**kwargs,
@@ -974,15 +1010,15 @@ def detach(self):
def _maybe_memory_map(
- handle: FileOrBuffer,
+ handle: str | BaseBuffer,
memory_map: bool,
encoding: str,
mode: str,
errors: str | None,
decode: bool,
-) -> tuple[FileOrBuffer, bool, list[Buffer]]:
+) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]:
"""Try to memory map file/buffer."""
- handles: list[Buffer] = []
+ handles: list[BaseBuffer] = []
memory_map &= hasattr(handle, "fileno") or isinstance(handle, str)
if not memory_map:
return handle, memory_map, handles
@@ -1001,10 +1037,11 @@ def _maybe_memory_map(
# error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
wrapped = cast(
- mmap.mmap,
+ BaseBuffer,
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
)
- handle.close()
+ # error: "BaseBuffer" has no attribute "close"
+ handle.close() # type: ignore[attr-defined]
handles.remove(handle)
handles.append(wrapped)
handle = wrapped
@@ -1018,7 +1055,7 @@ def _maybe_memory_map(
return handle, memory_map, handles
-def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool:
+def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool:
"""Test whether file exists."""
exists = False
filepath_or_buffer = stringify_path(filepath_or_buffer)
@@ -1032,7 +1069,7 @@ def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool:
return exists
-def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool:
+def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool:
"""Whether the handle is opened in binary mode"""
# specified by user
if "t" in mode or "b" in mode:
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 22fbaaaa8b2f8..04052b0fe9fdf 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -6,6 +6,7 @@
import os
from textwrap import fill
from typing import (
+ IO,
Any,
Mapping,
cast,
@@ -17,10 +18,11 @@
from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import (
- Buffer,
DtypeArg,
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
StorageOptions,
+ WriteExcelBuffer,
)
from pandas.compat._optional import (
get_version,
@@ -816,7 +818,7 @@ class ExcelWriter(metaclass=abc.ABCMeta):
# ExcelWriter.
def __new__(
cls,
- path: FilePathOrBuffer | ExcelWriter,
+ path: FilePath | WriteExcelBuffer | ExcelWriter,
engine=None,
date_format=None,
datetime_format=None,
@@ -918,7 +920,7 @@ def save(self):
def __init__(
self,
- path: FilePathOrBuffer | ExcelWriter,
+ path: FilePath | WriteExcelBuffer | ExcelWriter,
engine=None,
date_format=None,
datetime_format=None,
@@ -942,7 +944,7 @@ def __init__(
# cast ExcelWriter to avoid adding 'if self.handles is not None'
self.handles = IOHandles(
- cast(Buffer[bytes], path), compression={"copression": None}
+ cast(IO[bytes], path), compression={"copression": None}
)
if not isinstance(path, ExcelWriter):
self.handles = get_handle(
@@ -1061,7 +1063,7 @@ def close(self):
@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
- content_or_path: FilePathOrBuffer,
+ content_or_path: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
) -> str | None:
"""
@@ -1108,9 +1110,7 @@ def inspect_excel_format(
elif not peek.startswith(ZIP_SIGNATURE):
return None
- # ZipFile typing is overly-strict
- # https://github.com/python/typeshed/issues/4212
- zf = zipfile.ZipFile(stream) # type: ignore[arg-type]
+ zf = zipfile.ZipFile(stream)
# Workaround for some third party files that use forward slashes and
# lower case names.
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index e0c5a2c6a7ff9..952ad72b480b7 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -3,7 +3,8 @@
import numpy as np
from pandas._typing import (
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
Scalar,
StorageOptions,
)
@@ -28,7 +29,7 @@ class ODFReader(BaseExcelReader):
def __init__(
self,
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
):
import_optional_dependency("odf")
@@ -40,7 +41,7 @@ def _workbook_class(self):
return OpenDocument
- def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+ def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
from odf.opendocument import load
return load(filepath_or_buffer)
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index d499f1a5ea89f..f34bf311e6ce7 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -9,7 +9,8 @@
import numpy as np
from pandas._typing import (
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
Scalar,
StorageOptions,
)
@@ -505,7 +506,7 @@ def write_cells(
class OpenpyxlReader(BaseExcelReader):
def __init__(
self,
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
) -> None:
"""
@@ -527,7 +528,7 @@ def _workbook_class(self):
return Workbook
- def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+ def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
from openpyxl import load_workbook
return load_workbook(
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
index 4b2b9f7a3a678..9284cf917a48c 100644
--- a/pandas/io/excel/_pyxlsb.py
+++ b/pandas/io/excel/_pyxlsb.py
@@ -2,7 +2,8 @@
from __future__ import annotations
from pandas._typing import (
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
Scalar,
StorageOptions,
)
@@ -14,7 +15,7 @@
class PyxlsbReader(BaseExcelReader):
def __init__(
self,
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
storage_options: StorageOptions = None,
):
"""
@@ -38,7 +39,7 @@ def _workbook_class(self):
return Workbook
- def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+ def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
from pyxlsb import open_workbook
# TODO: hack in buffer capability
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index 145cbe182eadb..e4547b527a6b9 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -7,8 +7,10 @@
)
from pandas._typing import (
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
StorageOptions,
+ WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
@@ -26,7 +28,7 @@
@doc(storage_options=generic._shared_docs["storage_options"])
def to_feather(
df: DataFrame,
- path: FilePathOrBuffer[bytes],
+ path: FilePath | WriteBuffer[bytes],
storage_options: StorageOptions = None,
**kwargs,
):
@@ -36,7 +38,7 @@ def to_feather(
Parameters
----------
df : DataFrame
- path : string file path, or file-like object
+ path : str, path object, or file-like object
{storage_options}
.. versionadded:: 1.2.0
@@ -93,7 +95,7 @@ def to_feather(
@doc(storage_options=generic._shared_docs["storage_options"])
def read_feather(
- path: FilePathOrBuffer[bytes],
+ path: FilePath | ReadBuffer[bytes],
columns: Sequence[Hashable] | None = None,
use_threads: bool = True,
storage_options: StorageOptions = None,
@@ -103,18 +105,11 @@ def read_feather(
Parameters
----------
- path : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be:
- ``file://localhost/path/to/table.feather``.
-
- If you want to pass in a path object, pandas accepts any
- ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
+ path : str, path object, or file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``read()`` function. The string could be a URL.
+ Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
+ expected. A local file could be: ``file://localhost/path/to/table.feather``.
columns : sequence, default None
If not provided, all columns are read.
use_threads : bool, default True
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index f078975e4b85a..18228a93b5285 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -20,10 +20,11 @@
from pandas._libs import writers as libwriters
from pandas._typing import (
CompressionOptions,
- FilePathOrBuffer,
+ FilePath,
FloatFormatType,
IndexLabel,
StorageOptions,
+ WriteBuffer,
)
from pandas.core.dtypes.generic import (
@@ -48,7 +49,7 @@ class CSVFormatter:
def __init__(
self,
formatter: DataFrameFormatter,
- path_or_buf: FilePathOrBuffer[str] | FilePathOrBuffer[bytes] = "",
+ path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
sep: str = ",",
cols: Sequence[Hashable] | None = None,
index_label: IndexLabel | None = None,
@@ -57,7 +58,7 @@ def __init__(
errors: str = "strict",
compression: CompressionOptions = "infer",
quoting: int | None = None,
- line_terminator="\n",
+ line_terminator: str | None = "\n",
chunksize: int | None = None,
quotechar: str | None = '"',
date_format: str | None = None,
@@ -245,7 +246,7 @@ def save(self) -> None:
# Note: self.encoding is irrelevant here
self.writer = csvlib.writer(
- handles.handle, # type: ignore[arg-type]
+ handles.handle,
lineterminator=self.line_terminator,
delimiter=self.sep,
quoting=self.quoting,
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index ca53bfb7d5e08..616331bf80a44 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -19,7 +19,6 @@
IO,
TYPE_CHECKING,
Any,
- AnyStr,
Callable,
Hashable,
Iterable,
@@ -51,11 +50,12 @@
ColspaceArgType,
ColspaceType,
CompressionOptions,
- FilePathOrBuffer,
+ FilePath,
FloatFormatType,
FormattersType,
IndexLabel,
StorageOptions,
+ WriteBuffer,
)
from pandas.core.dtypes.common import (
@@ -1021,7 +1021,7 @@ def __init__(self, fmt: DataFrameFormatter):
def to_latex(
self,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
column_format: str | None = None,
longtable: bool = False,
encoding: str | None = None,
@@ -1053,7 +1053,7 @@ def to_latex(
def to_html(
self,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
encoding: str | None = None,
classes: str | list | tuple | None = None,
notebook: bool = False,
@@ -1066,8 +1066,10 @@ def to_html(
Parameters
----------
- buf : str, Path or StringIO-like, optional, default None
- Buffer to write to. If None, the output is returned as a string.
+ buf : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a string ``write()`` function. If None, the result is
+ returned as a string.
encoding : str, default “utf-8”
Set character encoding.
classes : str or list-like
@@ -1102,7 +1104,7 @@ def to_html(
def to_string(
self,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
encoding: str | None = None,
line_width: int | None = None,
) -> str | None:
@@ -1111,8 +1113,10 @@ def to_string(
Parameters
----------
- buf : str, Path or StringIO-like, optional, default None
- Buffer to write to. If None, the output is returned as a string.
+ buf : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a string ``write()`` function. If None, the result is
+ returned as a string.
encoding: str, default “utf-8”
Set character encoding.
line_width : int, optional
@@ -1126,7 +1130,7 @@ def to_string(
def to_csv(
self,
- path_or_buf: FilePathOrBuffer[AnyStr] | None = None,
+ path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
encoding: str | None = None,
sep: str = ",",
columns: Sequence[Hashable] | None = None,
@@ -1186,7 +1190,7 @@ def to_csv(
def save_to_buffer(
string: str,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
encoding: str | None = None,
) -> str | None:
"""
@@ -1200,7 +1204,7 @@ def save_to_buffer(
@contextmanager
-def get_buffer(buf: FilePathOrBuffer[str] | None, encoding: str | None = None):
+def get_buffer(buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None):
"""
Context manager to open, yield and close buffer for filenames or Path-like
objects, otherwise yield buf unchanged.
@@ -2142,7 +2146,7 @@ def get_level_lengths(
return result
-def buffer_put_lines(buf: IO[str], lines: list[str]) -> None:
+def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None:
"""
Appends lines to a buffer.
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 64a59778a54f3..ddd2420731028 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -6,7 +6,6 @@
)
import sys
from typing import (
- IO,
TYPE_CHECKING,
Iterable,
Iterator,
@@ -16,7 +15,10 @@
from pandas._config import get_option
-from pandas._typing import Dtype
+from pandas._typing import (
+ Dtype,
+ WriteBuffer,
+)
from pandas.core.indexes.api import Index
@@ -171,7 +173,7 @@ def size_qualifier(self) -> str:
def render(
self,
*,
- buf: IO[str] | None,
+ buf: WriteBuffer[str] | None,
max_cols: int | None,
verbose: bool | None,
show_counts: bool | None,
@@ -287,7 +289,7 @@ def memory_usage_bytes(self) -> int:
def render(
self,
*,
- buf: IO[str] | None,
+ buf: WriteBuffer[str] | None,
max_cols: int | None,
verbose: bool | None,
show_counts: bool | None,
@@ -306,7 +308,7 @@ class InfoPrinterAbstract:
Class for printing dataframe or series info.
"""
- def to_buffer(self, buf: IO[str] | None = None) -> None:
+ def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None:
"""Save dataframe info into buffer."""
table_builder = self._create_table_builder()
lines = table_builder.get_lines()
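
A sketch of `DataFrame.info` writing into a string buffer, matching the `WriteBuffer[str]` annotation above; illustrative only.

```python
# Illustrative only.
import io

import pandas as pd

buf = io.StringIO()
pd.DataFrame({"a": [1, 2]}).info(buf=buf)   # buf needs only a string write()
print(buf.getvalue())
```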
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 40803ff14e357..b16e6e6366330 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -21,10 +21,11 @@
from pandas._typing import (
Axis,
- FilePathOrBuffer,
+ FilePath,
IndexLabel,
Level,
Scalar,
+ WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
@@ -464,7 +465,7 @@ def to_excel(
def to_latex(
self,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
*,
column_format: str | None = None,
position: str | None = None,
@@ -488,8 +489,10 @@ def to_latex(
Parameters
----------
- buf : str, Path, or StringIO-like, optional, default None
- Buffer to write to. If `None`, the output is returned as a string.
+ buf : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a string ``write()`` function. If None, the result is
+ returned as a string.
column_format : str, optional
The LaTeX column specification placed in location:
@@ -893,7 +896,7 @@ def to_latex(
def to_html(
self,
- buf: FilePathOrBuffer[str] | None = None,
+ buf: FilePath | WriteBuffer[str] | None = None,
*,
table_uuid: str | None = None,
table_attributes: str | None = None,
@@ -915,8 +918,10 @@ def to_html(
Parameters
----------
- buf : str, Path, or StringIO-like, optional, default None
- Buffer to write to. If ``None``, the output is returned as a string.
+ buf : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a string ``write()`` function. If None, the result is
+ returned as a string.
table_uuid : str, optional
Id attribute assigned to the
HTML element in the format:
diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
index ea7d1dfa1645e..b997cd9bddd1e 100644
--- a/pandas/io/formats/xml.py
+++ b/pandas/io/formats/xml.py
@@ -9,8 +9,10 @@
from pandas._typing import (
CompressionOptions,
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
StorageOptions,
+ WriteBuffer,
)
from pandas.errors import AbstractMethodError
@@ -90,7 +92,7 @@ class BaseXMLFormatter:
def __init__(
self,
frame: DataFrame,
- path_or_buffer: FilePathOrBuffer | None = None,
+ path_or_buffer: FilePath | WriteBuffer[bytes] | None = None,
index: bool | None = True,
root_name: str | None = "data",
row_name: str | None = "row",
@@ -102,7 +104,7 @@ def __init__(
encoding: str = "utf-8",
xml_declaration: bool | None = True,
pretty_print: bool | None = True,
- stylesheet: FilePathOrBuffer | None = None,
+ stylesheet: FilePath | ReadBuffer[str] | None = None,
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
) -> None:
@@ -272,7 +274,7 @@ def write_output(self) -> str | None:
storage_options=self.storage_options,
is_text=False,
) as handles:
- handles.handle.write(xml_doc) # type: ignore[arg-type]
+ handles.handle.write(xml_doc)
return None
else:
@@ -582,7 +584,6 @@ def transform_doc(self) -> bytes:
conditionally by its specific object type, then transforms
original tree with XSLT script.
"""
-
from lxml.etree import (
XSLT,
XMLParser,
@@ -591,6 +592,7 @@ def transform_doc(self) -> bytes:
)
style_doc = self.stylesheet
+ assert style_doc is not None # is ensured by caller
handle_data = get_data_from_filepath(
filepath_or_buffer=style_doc,
diff --git a/pandas/io/html.py b/pandas/io/html.py
index cffe910f1c8ff..7985dcbec9672 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -14,7 +14,10 @@
Sequence,
)
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import (
+ FilePath,
+ ReadBuffer,
+)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
AbstractMethodError,
@@ -119,18 +122,21 @@ def _get_skiprows(skiprows: int | Sequence[int] | slice | None):
raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
-def _read(obj: bytes | FilePathOrBuffer, encoding: str | None) -> str | bytes:
+def _read(
+ obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
+) -> str | bytes:
"""
Try to read from a url, file or string.
Parameters
----------
- obj : str, unicode, or file-like
+ obj : str, unicode, path object, or file-like object
Returns
-------
raw_text : str
"""
+ text: str | bytes
if (
is_url(obj)
or hasattr(obj, "read")
@@ -148,9 +154,7 @@ def _read(obj: bytes | FilePathOrBuffer, encoding: str | None) -> str | bytes:
text = obj
else:
raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
- # error: Incompatible return value type (got "Union[Any, bytes, None, str]",
- # expected "Union[str, bytes]")
- return text # type: ignore[return-value]
+ return text
class _HtmlFrameParser:
@@ -211,7 +215,7 @@ class _HtmlFrameParser:
def __init__(
self,
- io: FilePathOrBuffer,
+ io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
match: str | Pattern,
attrs: dict[str, str] | None,
encoding: str,
@@ -944,7 +948,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
- io: FilePathOrBuffer,
+ io: FilePath | ReadBuffer[str],
match: str | Pattern = ".+",
flavor: str | None = None,
header: int | Sequence[int] | None = None,
@@ -965,8 +969,10 @@ def read_html(
Parameters
----------
- io : str, path object or file-like object
- A URL, a file-like object, or a raw string containing HTML. Note that
+ io : str, path object, or file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a string ``read()`` function.
+ The string can represent a URL or the HTML itself. Note that
lxml only accepts the http, ftp and file url protocols. If you have a
URL that starts with ``'https'`` you might try removing the ``'s'``.
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 6bdb4df806b5c..6dd4de597c29d 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -3,7 +3,10 @@
from typing import TYPE_CHECKING
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import (
+ FilePath,
+ ReadBuffer,
+)
from pandas.compat._optional import import_optional_dependency
from pandas.io.common import get_handle
@@ -13,7 +16,7 @@
def read_orc(
- path: FilePathOrBuffer, columns: list[str] | None = None, **kwargs
+ path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs
) -> DataFrame:
"""
Load an ORC object from the file path, returning a DataFrame.
@@ -22,18 +25,12 @@ def read_orc(
Parameters
----------
- path : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
+ path : str, path object, or file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``read()`` function. The string could be a URL.
+ Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.orc``.
-
- If you want to pass in a path object, pandas accepts any
- ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
columns : list, default None
If not None, only these columns will be read from the file.
**kwargs
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 2eb1dd2d44d65..c4b9e36472092 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -3,15 +3,14 @@
import io
import os
-from typing import (
- Any,
- AnyStr,
-)
+from typing import Any
from warnings import catch_warnings
from pandas._typing import (
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
StorageOptions,
+ WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
@@ -69,12 +68,14 @@ def get_engine(engine: str) -> BaseImpl:
def _get_path_or_handle(
- path: FilePathOrBuffer,
+ path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
fs: Any,
storage_options: StorageOptions = None,
mode: str = "rb",
is_dir: bool = False,
-) -> tuple[FilePathOrBuffer, IOHandles[bytes] | None, Any]:
+) -> tuple[
+ FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
+]:
"""File handling for PyArrow."""
path_or_handle = stringify_path(path)
if is_fsspec_url(path_or_handle) and fs is None:
@@ -157,7 +158,7 @@ def __init__(self):
def write(
self,
df: DataFrame,
- path: FilePathOrBuffer[AnyStr],
+ path: FilePath | WriteBuffer[bytes],
compression: str | None = "snappy",
index: bool | None = None,
storage_options: StorageOptions = None,
@@ -353,7 +354,7 @@ def read(
@doc(storage_options=generic._shared_docs["storage_options"])
def to_parquet(
df: DataFrame,
- path: FilePathOrBuffer | None = None,
+ path: FilePath | WriteBuffer[bytes] | None = None,
engine: str = "auto",
compression: str | None = "snappy",
index: bool | None = None,
@@ -367,13 +368,12 @@ def to_parquet(
Parameters
----------
df : DataFrame
- path : str or file-like object, default None
- If a string, it will be used as Root Directory path
- when writing a partitioned dataset. By file-like object,
- we refer to objects with a write() method, such as a file handle
- (e.g. via builtin open function) or io.BytesIO. The engine
- fastparquet does not accept file-like objects. If path is None,
- a bytes object is returned.
+ path : str, path object, file-like object, or None, default None
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``write()`` function. If None, the result is
+ returned as bytes. If a string, it will be used as Root Directory path
+ when writing a partitioned dataset. The engine fastparquet does not
+ accept file-like objects.
.. versionchanged:: 1.2.0
@@ -415,7 +415,7 @@ def to_parquet(
partition_cols = [partition_cols]
impl = get_engine(engine)
- path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path
+ path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
impl.write(
df,
@@ -449,21 +449,15 @@ def read_parquet(
Parameters
----------
path : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
- expected. A local file could be:
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``read()`` function.
+ The string could be a URL. Valid URL schemes include http, ftp, s3,
+ gs, and file. For file URLs, a host is expected. A local file could be:
``file://localhost/path/to/table.parquet``.
A file URL can also be a path to a directory that contains multiple
partitioned parquet files. Both pyarrow and fastparquet support
paths to directories as well as file URLs. A directory path could be:
``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``
-
- If you want to pass in a path object, pandas accepts any
- ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
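
A sketch of the `path=None` behaviour documented above; assumes a parquet engine such as pyarrow is installed.

```python
# Illustrative only; requires pyarrow (or another parquet engine).
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

blob = df.to_parquet(path=None)                   # None -> bytes are returned
assert isinstance(blob, bytes)

roundtripped = pd.read_parquet(io.BytesIO(blob))  # binary read() buffer
assert roundtripped.equals(df)
```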
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 5b1b178c4f610..9fbeeb74901ef 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -1,6 +1,9 @@
from __future__ import annotations
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import (
+ FilePath,
+ ReadBuffer,
+)
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.inference import is_integer
@@ -16,7 +19,7 @@ class ArrowParserWrapper(ParserBase):
Wrapper for the pyarrow engine for read_csv()
"""
- def __init__(self, src: FilePathOrBuffer, **kwds):
+ def __init__(self, src: FilePath | ReadBuffer[bytes], **kwds):
self.kwds = kwds
self.src = src
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 42b9c8c9f10fe..d096e9008112b 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -26,7 +26,8 @@
from pandas._typing import (
ArrayLike,
DtypeArg,
- FilePathOrBuffer,
+ FilePath,
+ ReadCsvBuffer,
)
from pandas.errors import (
ParserError,
@@ -218,7 +219,11 @@ def __init__(self, kwds):
# Normally, this arg would get pre-processed earlier on
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
- def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None:
+ def _open_handles(
+ self,
+ src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+ kwds: dict[str, Any],
+ ) -> None:
"""
Let the readers open IOHandles after they are done with their potential raises.
"""
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index db750cded45e5..e96df3b3f3782 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -7,7 +7,8 @@
import pandas._libs.parsers as parsers
from pandas._typing import (
ArrayLike,
- FilePathOrBuffer,
+ FilePath,
+ ReadCsvBuffer,
)
from pandas.errors import DtypeWarning
from pandas.util._exceptions import find_stack_level
@@ -31,7 +32,9 @@ class CParserWrapper(ParserBase):
low_memory: bool
_reader: parsers.TextReader
- def __init__(self, src: FilePathOrBuffer, **kwds):
+ def __init__(
+ self, src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds
+ ):
self.kwds = kwds
kwds = kwds.copy()
ParserBase.__init__(self, kwds)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 36387f0835f4a..f5420618c0235 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -20,7 +20,8 @@
import pandas._libs.lib as lib
from pandas._typing import (
- FilePathOrBuffer,
+ FilePath,
+ ReadCsvBuffer,
Scalar,
)
from pandas.errors import (
@@ -45,7 +46,9 @@
class PythonParser(ParserBase):
- def __init__(self, f: FilePathOrBuffer | list, **kwds):
+ def __init__(
+ self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, **kwds
+ ):
"""
Workhorse function for processing nested list into DataFrame
"""
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 0b57f0f5ef814..9f555d77948a7 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -17,7 +17,8 @@
from pandas._typing import (
ArrayLike,
DtypeArg,
- FilePathOrBuffer,
+ FilePath,
+ ReadCsvBuffer,
StorageOptions,
)
from pandas.errors import (
@@ -505,7 +506,9 @@ def _validate_names(names):
raise ValueError("Names should be an ordered collection.")
-def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
+def _read(
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
+):
"""Generic reader of line files."""
if kwds.get("date_parser", None) is not None:
if isinstance(kwds["parse_dates"], bool):
@@ -554,7 +557,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
)
)
def read_csv(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
sep=lib.no_default,
delimiter=None,
# Column and Index Locations and Names
@@ -652,7 +655,7 @@ def read_csv(
)
)
def read_table(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
sep=lib.no_default,
delimiter=None,
# Column and Index Locations and Names
@@ -739,7 +742,7 @@ def read_table(
def read_fwf(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
colspecs="infer",
widths=None,
infer_nrows=100,
@@ -756,18 +759,12 @@ def read_fwf(
Parameters
----------
- filepath_or_buffer : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
+ filepath_or_buffer : str, path object, or file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a text ``read()`` function. The string could be a URL.
+ Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.csv``.
-
- If you want to pass in a path object, pandas accepts any
- ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
colspecs : list of tuple (int, int) or 'infer'. optional
A list of tuples giving the extents of the fixed-width
fields of each line as half-open intervals (i.e., [from, to[ ).
@@ -942,10 +939,10 @@ def _get_options_with_defaults(self, engine):
def _check_file_or_buffer(self, f, engine):
# see gh-16530
- if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"):
- # The C engine doesn't need the file-like to have the "__next__"
- # attribute. However, the Python engine explicitly calls
- # "__next__(...)" when iterating through such an object, meaning it
+ if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
+ # The C engine doesn't need the file-like to have the "__iter__"
+ # attribute. However, the Python engine needs "__iter__(...)"
+ # when iterating through such an object, meaning it
# needs to have that attribute
raise ValueError(
"The 'python' engine cannot iterate through this file buffer."
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 6a91c12ee286e..8bd0942550e6e 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -1,12 +1,16 @@
""" pickle compat """
+from __future__ import annotations
+
import pickle
from typing import Any
import warnings
from pandas._typing import (
CompressionOptions,
- FilePathOrBuffer,
+ FilePath,
+ ReadPickleBuffer,
StorageOptions,
+ WriteBuffer,
)
from pandas.compat import pickle_compat as pc
from pandas.util._decorators import doc
@@ -19,7 +23,7 @@
@doc(storage_options=generic._shared_docs["storage_options"])
def to_pickle(
obj: Any,
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | WriteBuffer[bytes],
compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
@@ -31,8 +35,9 @@ def to_pickle(
----------
obj : any object
Any python object.
- filepath_or_buffer : str, path object or file-like object
- File path, URL, or buffer where the pickled object will be stored.
+ filepath_or_buffer : str, path object, or file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``write()`` function.
.. versionchanged:: 1.0.0
Accept URL. URL has to be of S3 or GCS.
@@ -103,26 +108,15 @@ def to_pickle(
# pickle create the entire object and then write it to the buffer.
# "zip" would also be here if pandas.io.common._BytesZipFile
# wouldn't buffer write calls
- handles.handle.write(
- # error: Argument 1 to "write" of "TextIOBase" has incompatible type
- # "bytes"; expected "str"
- pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type]
- )
+ handles.handle.write(pickle.dumps(obj, protocol=protocol))
else:
# letting pickle write directly to the buffer is more memory-efficient
- pickle.dump(
- # error: Argument 2 to "dump" has incompatible type "Union[IO[Any],
- # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]"; expected
- # "IO[bytes]"
- obj,
- handles.handle, # type: ignore[arg-type]
- protocol=protocol,
- )
+ pickle.dump(obj, handles.handle, protocol=protocol)
@doc(storage_options=generic._shared_docs["storage_options"])
def read_pickle(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadPickleBuffer,
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
):
@@ -136,8 +130,9 @@ def read_pickle(
Parameters
----------
- filepath_or_buffer : str, path object or file-like object
- File path, URL, or buffer where the pickled object will be loaded from.
+ filepath_or_buffer : str, path object, or file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``readline()`` function.
.. versionchanged:: 1.0.0
Accept URL. URL is not limited to S3 and GCS.
@@ -211,10 +206,7 @@ def read_pickle(
with warnings.catch_warnings(record=True):
# We want to silence any warnings about, e.g. moved modules.
warnings.simplefilter("ignore", Warning)
- # error: Argument 1 to "load" has incompatible type "Union[IO[Any],
- # RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]";
- # expected "IO[bytes]"
- return pickle.load(handles.handle) # type: ignore[arg-type]
+ return pickle.load(handles.handle)
except excs_to_catch:
# e.g.
# "No module named 'pandas.core.sparse.series'"
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 300df9728cd75..cd863cabf5c2d 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -21,14 +21,14 @@
timedelta,
)
import struct
-from typing import (
- IO,
- Any,
- cast,
-)
+from typing import cast
import numpy as np
+from pandas._typing import (
+ FilePath,
+ ReadBuffer,
+)
from pandas.errors import (
EmptyDataError,
OutOfBoundsDatetime,
@@ -159,7 +159,7 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
def __init__(
self,
- path_or_buf,
+ path_or_buf: FilePath | ReadBuffer[bytes],
index=None,
convert_dates=True,
blank_missing=True,
@@ -179,16 +179,16 @@ def __init__(
self.default_encoding = "latin-1"
self.compression = b""
- self.column_names_strings = []
- self.column_names = []
- self.column_formats = []
- self.columns = []
+ self.column_names_strings: list[str] = []
+ self.column_names: list[str] = []
+ self.column_formats: list[str] = []
+ self.columns: list[_Column] = []
- self._current_page_data_subheader_pointers = []
+ self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
self._cached_page = None
- self._column_data_lengths = []
- self._column_data_offsets = []
- self._column_types = []
+ self._column_data_lengths: list[int] = []
+ self._column_data_offsets: list[int] = []
+ self._column_types: list[bytes] = []
self._current_row_in_file_index = 0
self._current_row_on_page_index = 0
@@ -196,7 +196,7 @@ def __init__(
self.handles = get_handle(path_or_buf, "rb", is_text=False)
- self._path_or_buf = cast(IO[Any], self.handles.handle)
+ self._path_or_buf = self.handles.handle
try:
self._get_properties()
@@ -227,7 +227,7 @@ def _get_properties(self) -> None:
# Check magic number
self._path_or_buf.seek(0)
- self._cached_page = cast(bytes, self._path_or_buf.read(288))
+ self._cached_page = self._path_or_buf.read(288)
if self._cached_page[0 : len(const.magic)] != const.magic:
raise ValueError("magic number mismatch (not a SAS file?)")
@@ -301,7 +301,7 @@ def _get_properties(self) -> None:
)
# Read the rest of the header into cached_page.
- buf = cast(bytes, self._path_or_buf.read(self.header_length - 288))
+ buf = self._path_or_buf.read(self.header_length - 288)
self._cached_page += buf
# error: Argument 1 to "len" has incompatible type "Optional[bytes]";
# expected "Sized"
@@ -400,7 +400,7 @@ def _read_bytes(self, offset: int, length: int):
def _parse_metadata(self) -> None:
done = False
while not done:
- self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length))
+ self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
break
if len(self._cached_page) != self._page_length:
@@ -761,7 +761,7 @@ def read(self, nrows: int | None = None) -> DataFrame | None:
def _read_next_page(self):
self._current_page_data_subheader_pointers = []
- self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length))
+ self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
return True
elif len(self._cached_page) != self._page_length:
@@ -817,7 +817,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
js += 1
else:
self.close()
- raise ValueError(f"unknown column type {self._column_types[j]}")
+ raise ValueError(f"unknown column type {repr(self._column_types[j])}")
df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
return df
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
index 3f9bf6662e99f..d8a3412e05d05 100644
--- a/pandas/io/sas/sas_xport.py
+++ b/pandas/io/sas/sas_xport.py
@@ -7,17 +7,19 @@
https://support.sas.com/techsup/technote/ts140.pdf
"""
+from __future__ import annotations
+
from collections import abc
from datetime import datetime
import struct
-from typing import (
- IO,
- cast,
-)
import warnings
import numpy as np
+from pandas._typing import (
+ FilePath,
+ ReadBuffer,
+)
from pandas.util._decorators import Appender
import pandas as pd
@@ -248,7 +250,11 @@ class XportReader(ReaderBase, abc.Iterator):
__doc__ = _xport_reader_doc
def __init__(
- self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None
+ self,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
+ index=None,
+ encoding: str | None = "ISO-8859-1",
+ chunksize=None,
):
self._encoding = encoding
@@ -259,7 +265,7 @@ def __init__(
self.handles = get_handle(
filepath_or_buffer, "rb", encoding=encoding, is_text=False
)
- self.filepath_or_buffer = cast(IO[bytes], self.handles.handle)
+ self.filepath_or_buffer = self.handles.handle
try:
self._read_header()
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index b323ce39763a1..f50fc777f55e9 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -13,7 +13,10 @@
overload,
)
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import (
+ FilePath,
+ ReadBuffer,
+)
from pandas.io.common import stringify_path
@@ -44,7 +47,7 @@ def __exit__(self, exc_type, exc_value, traceback):
@overload
def read_sas(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
format: str | None = ...,
index: Hashable | None = ...,
encoding: str | None = ...,
@@ -56,7 +59,7 @@ def read_sas(
@overload
def read_sas(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
format: str | None = ...,
index: Hashable | None = ...,
encoding: str | None = ...,
@@ -67,7 +70,7 @@ def read_sas(
def read_sas(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
format: str | None = None,
index: Hashable | None = None,
encoding: str | None = None,
@@ -79,18 +82,12 @@ def read_sas(
Parameters
----------
- filepath_or_buffer : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
+ filepath_or_buffer : str, path object, or file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``read()`` function. The string could be a URL.
+ Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.sas``.
-
- If you want to pass in a path object, pandas accepts any
- ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
format : str {'xport', 'sas7bdat'} or None
If None, file format is inferred from file extension. If 'xport' or
'sas7bdat', uses the corresponding format.
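A short usage sketch for the annotated read_sas signature above; the file name is hypothetical, and format= is passed explicitly in the buffer case because it cannot be inferred from a handle:

import pandas as pd

# FilePath: a plain string (format inferred from the .sas7bdat extension).
df = pd.read_sas("airline.sas7bdat")

# ReadBuffer[bytes]: an already-open binary handle; the format must be
# spelled out because there is no file extension to inspect.
with open("airline.sas7bdat", "rb") as handle:
    df = pd.read_sas(handle, format="sas7bdat")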
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 013f17580600d..ff9d8a1be3d1e 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -18,6 +18,7 @@
import struct
import sys
from typing import (
+ IO,
TYPE_CHECKING,
Any,
AnyStr,
@@ -33,10 +34,11 @@
from pandas._libs.lib import infer_dtype
from pandas._libs.writers import max_len_string_array
from pandas._typing import (
- Buffer,
CompressionOptions,
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
StorageOptions,
+ WriteBuffer,
)
from pandas.util._decorators import (
Appender,
@@ -1117,7 +1119,7 @@ class StataReader(StataParser, abc.Iterator):
def __init__(
self,
- path_or_buf: FilePathOrBuffer,
+ path_or_buf: FilePath | ReadBuffer[bytes],
convert_dates: bool = True,
convert_categoricals: bool = True,
index_col: str | None = None,
@@ -1168,10 +1170,7 @@ def __init__(
compression=compression,
) as handles:
# Copy to BytesIO, and ensure no encoding
-
- # Argument 1 to "BytesIO" has incompatible type "Union[Any, bytes, None,
- # str]"; expected "bytes"
- self.path_or_buf = BytesIO(handles.handle.read()) # type: ignore[arg-type]
+ self.path_or_buf = BytesIO(handles.handle.read())
self._read_header()
self._setup_dtype()
@@ -2002,7 +2001,7 @@ def value_labels(self) -> dict[str, dict[float | int, str]]:
@Appender(_read_stata_doc)
def read_stata(
- filepath_or_buffer: FilePathOrBuffer,
+ filepath_or_buffer: FilePath | ReadBuffer[bytes],
convert_dates: bool = True,
convert_categoricals: bool = True,
index_col: str | None = None,
@@ -2270,7 +2269,7 @@ class StataWriter(StataParser):
def __init__(
self,
- fname: FilePathOrBuffer,
+ fname: FilePath | WriteBuffer[bytes],
data: DataFrame,
convert_dates: dict[Hashable, str] | None = None,
write_index: bool = True,
@@ -2294,7 +2293,7 @@ def __init__(
self._value_labels: list[StataValueLabel] = []
self._has_value_labels = np.array([], dtype=bool)
self._compression = compression
- self._output_file: Buffer[bytes] | None = None
+ self._output_file: IO[bytes] | None = None
self._converted_names: dict[Hashable, str] = {}
# attach nobs, nvars, data, varlist, typlist
self._prepare_pandas(data)
@@ -2310,15 +2309,13 @@ def _write(self, to_write: str) -> None:
"""
Helper to call encode before writing to file for Python 3 compat.
"""
- self.handles.handle.write(
- to_write.encode(self._encoding) # type: ignore[arg-type]
- )
+ self.handles.handle.write(to_write.encode(self._encoding))
def _write_bytes(self, value: bytes) -> None:
"""
Helper to assert file is open before writing.
"""
- self.handles.handle.write(value) # type: ignore[arg-type]
+ self.handles.handle.write(value)
def _prepare_non_cat_value_labels(
self, data: DataFrame
@@ -2686,7 +2683,7 @@ def _close(self) -> None:
if self._output_file is not None:
assert isinstance(self.handles.handle, BytesIO)
bio, self.handles.handle = self.handles.handle, self._output_file
- self.handles.handle.write(bio.getvalue()) # type: ignore[arg-type]
+ self.handles.handle.write(bio.getvalue())
def _write_map(self) -> None:
"""No-op, future compatibility"""
@@ -3203,7 +3200,7 @@ class StataWriter117(StataWriter):
def __init__(
self,
- fname: FilePathOrBuffer,
+ fname: FilePath | WriteBuffer[bytes],
data: DataFrame,
convert_dates: dict[Hashable, str] | None = None,
write_index: bool = True,
@@ -3605,7 +3602,7 @@ class StataWriterUTF8(StataWriter117):
def __init__(
self,
- fname: FilePathOrBuffer,
+ fname: FilePath | WriteBuffer[bytes],
data: DataFrame,
convert_dates: dict[Hashable, str] | None = None,
write_index: bool = True,
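A sketch of the two fname shapes the StataWriter classes now advertise, FilePath and WriteBuffer[bytes]; the file name is illustrative:

from io import BytesIO

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0]})

# FilePath: a plain string (or os.PathLike[str]).
df.to_stata("frame.dta", write_index=False)

# WriteBuffer[bytes]: any object with binary write()/flush(), e.g. BytesIO.
buf = BytesIO()
df.to_stata(buf, write_index=False)
buf.seek(0)
df_back = pd.read_stata(buf)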
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index bc3436861f1a8..3c3b4afa2c57d 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -7,9 +7,9 @@
import io
from pandas._typing import (
- Buffer,
CompressionOptions,
- FilePathOrBuffer,
+ FilePath,
+ ReadBuffer,
StorageOptions,
)
from pandas.compat._optional import import_optional_dependency
@@ -199,9 +199,6 @@ class _EtreeFrameParser(_XMLFrameParser):
standard library XML module: `xml.etree.ElementTree`.
"""
- def __init__(self, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
-
def parse_data(self) -> list[dict[str, str | None]]:
from xml.etree.ElementTree import XML
@@ -571,11 +568,11 @@ def _transform_doc(self) -> bytes:
def get_data_from_filepath(
- filepath_or_buffer,
+ filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
encoding,
compression,
storage_options,
-) -> str | bytes | Buffer:
+) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
"""
Extract raw XML data.
@@ -587,7 +584,8 @@ def get_data_from_filepath(
This method turns (1) into (2) to simplify the rest of the processing.
It returns input types (2) and (3) unchanged.
"""
- filepath_or_buffer = stringify_path(filepath_or_buffer)
+ if not isinstance(filepath_or_buffer, bytes):
+ filepath_or_buffer = stringify_path(filepath_or_buffer)
if (
isinstance(filepath_or_buffer, str)
@@ -606,7 +604,10 @@ def get_data_from_filepath(
storage_options=storage_options,
) as handle_obj:
filepath_or_buffer = (
- handle_obj.handle.read()
+ # error: Incompatible types in assignment (expression has type
+ # "Union[str, IO[str]]", variable has type "Union[Union[str,
+ # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
+ handle_obj.handle.read() # type: ignore[assignment]
if hasattr(handle_obj.handle, "read")
else handle_obj.handle
)
@@ -728,7 +729,7 @@ def _parse(
@doc(storage_options=_shared_docs["storage_options"])
def read_xml(
- path_or_buffer: FilePathOrBuffer,
+ path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
xpath: str | None = "./*",
namespaces: dict | list[dict] | None = None,
elems_only: bool | None = False,
@@ -736,7 +737,7 @@ def read_xml(
names: list[str] | None = None,
encoding: str | None = "utf-8",
parser: str | None = "lxml",
- stylesheet: FilePathOrBuffer | None = None,
+ stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
) -> DataFrame:
@@ -748,8 +749,10 @@ def read_xml(
Parameters
----------
path_or_buffer : str, path object, or file-like object
- Any valid XML string or path is acceptable. The string could be a URL.
- Valid URL schemes include http, ftp, s3, and file.
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a ``read()`` function. The string can be any valid XML
+ string or a path. The string could also be a URL. Valid URL schemes
+ include http, ftp, s3, and file.
xpath : str, optional, default './\*'
The XPath to parse required set of nodes for migration to DataFrame.
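A quick sketch of the path_or_buffer variants the read_xml docstring above accepts (literal XML string, path or URL string, or file-like object); the XML is illustrative:

from io import StringIO

import pandas as pd

xml = """<?xml version="1.0" encoding="utf-8"?>
<data>
  <row><a>1</a><b>x</b></row>
  <row><a>2</a><b>y</b></row>
</data>"""

# Literal XML string.
df = pd.read_xml(xml)

# File-like object (a ReadBuffer[str]); a path or URL string works the same way.
df = pd.read_xml(StringIO(xml))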
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index f62c9fd1349bf..df8be721ec871 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -310,3 +310,22 @@ def test_malformed_skipfooter(python_parser_only):
msg = "Expected 3 fields in line 4, saw 5"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
+
+
+def test_python_engine_file_no_next(python_parser_only):
+ parser = python_parser_only
+
+ class NoNextBuffer:
+ def __init__(self, csv_data):
+ self.data = csv_data
+
+ def __iter__(self):
+ return self.data.__iter__()
+
+ def read(self):
+ return self.data
+
+ def readline(self):
+ return self.data
+
+ parser.read_csv(NoNextBuffer("a\n1"))
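Outside the test suite, the same idea looks roughly like this: a user-supplied reader only needs read(), readline() and __iter__() for engine="python". A sketch under that assumption; the class name is made up:

from io import StringIO

import pandas as pd

class LineBuffer:
    # Wraps a string and exposes just the methods the python engine exercises.
    def __init__(self, text):
        self._buf = StringIO(text)

    def read(self, n=-1):
        return self._buf.read(n)

    def readline(self):
        return self._buf.readline()

    def __iter__(self):
        return iter(self._buf)

df = pd.read_csv(LineBuffer("a,b\n1,2\n3,4\n"), engine="python")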
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index 1e5cf49ce24ae..89d35499fd597 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -104,22 +104,25 @@ def test_python_engine(self, python_engine):
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine=python_engine, **kwargs)
- def test_python_engine_file_no_next(self, python_engine):
+ def test_python_engine_file_no_iter(self, python_engine):
# see gh-16530
class NoNextBuffer:
def __init__(self, csv_data):
self.data = csv_data
- def __iter__(self):
- return self
+ def __next__(self):
+ return self.data.__next__()
def read(self):
return self.data
+ def readline(self):
+ return self.data
+
data = "a\n1"
- msg = "The 'python' engine cannot iterate"
+ msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"
- with pytest.raises(ValueError, match=msg):
+ with pytest.raises(TypeError, match=msg):
read_csv(NoNextBuffer(data), engine=python_engine)
def test_pyarrow_engine(self):
diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index c257b61db296e..b8d146c597d2c 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -1287,8 +1287,7 @@ def test_compression_output(parser, comp):
output = equalize_decl(output)
- # error: Item "None" of "Union[str, bytes, None]" has no attribute "strip"
- assert geom_xml == output.strip() # type: ignore[union-attr]
+ assert geom_xml == output.strip()
@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"])
@@ -1306,8 +1305,7 @@ def test_filename_and_suffix_comp(parser, comp, compfile):
output = equalize_decl(output)
- # error: Item "None" of "Union[str, bytes, None]" has no attribute "strip"
- assert geom_xml == output.strip() # type: ignore[union-attr]
+ assert geom_xml == output.strip()
def test_unsuported_compression(datapath, parser):