ENH: Synchronize pickle with upstream #206


Merged 14 commits on Sep 5, 2022
3 changes: 3 additions & 0 deletions pandas-stubs/_typing.pyi
@@ -63,6 +63,9 @@ class BaseBuffer(Protocol): ...
 class ReadBuffer(BaseBuffer, Protocol[AnyStr_cov]): ...
 class WriteBuffer(BaseBuffer, Protocol[AnyStr_cov]): ...
 
+class ReadPickleBuffer(ReadBuffer[bytes], Protocol):
+    def readline(self) -> AnyStr_cov: ...
+
 FilePath = Union[str, PathLike[str]]
 
 Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]
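
For context, an in-memory io.BytesIO object satisfies both WriteBuffer[bytes] and the new ReadPickleBuffer protocol (it has write(), read(), and readline()), so a pickle round trip can stay entirely in memory. A minimal illustrative sketch, not part of the diff:

import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

buf = io.BytesIO()
df.to_pickle(buf)               # BytesIO works as a WriteBuffer[bytes]
buf.seek(0)
restored = pd.read_pickle(buf)  # and as a ReadPickleBuffer (read() + readline())
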
9 changes: 7 additions & 2 deletions pandas-stubs/core/frame.pyi
@@ -48,8 +48,10 @@ from pandas._typing import (
     Axes,
     Axis,
     AxisType,
+    CompressionOptions,
     Dtype,
     DtypeNp,
+    FilePath,
     FilePathOrBuffer,
     FilePathOrBytesBuffer,
     GroupByObjectNonScalar,
@@ -67,8 +69,10 @@ from pandas._typing import (
     Scalar,
     ScalarT,
     SeriesAxisType,
+    StorageOptions,
     StrLike,
     T as TType,
+    WriteBuffer,
     np_ndarray_bool,
     np_ndarray_str,
     num,
@@ -2017,9 +2021,10 @@ class DataFrame(NDFrame, OpsMixin):
     ) -> _str: ...
     def to_pickle(
         self,
-        path: _str,
-        compression: _str | Literal["infer", "gzip", "bz2", "zip", "xz"] = ...,
+        path: FilePath | WriteBuffer[bytes],
+        compression: CompressionOptions = ...,
         protocol: int = ...,
+        storage_options: StorageOptions = ...,
     ) -> None: ...
     def to_sql(
         self,
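
To show what the widened DataFrame.to_pickle signature accepts, a short usage sketch (file names are placeholders chosen for illustration):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [0.0, 0.0, 0.0]})

df.to_pickle("frame.pkl")                                     # FilePath
df.to_pickle("frame.pkl.gz", compression="gzip", protocol=4)  # CompressionOptions + protocol
with open("frame.pkl", "wb") as fh:
    df.to_pickle(fh)                                          # WriteBuffer[bytes]
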
9 changes: 7 additions & 2 deletions pandas-stubs/core/generic.pyi
@@ -18,15 +18,19 @@ from pandas._typing import (
     S1,
     ArrayLike,
     Axis,
+    CompressionOptions,
     Dtype,
+    FilePath,
     FilePathOrBuffer,
     FrameOrSeries,
     FrameOrSeriesUnion,
     IgnoreRaise,
     Level,
     Scalar,
     SeriesAxisType,
+    StorageOptions,
     T,
+    WriteBuffer,
 )
 
 _bool = bool
@@ -160,9 +164,10 @@ class NDFrame(PandasObject, indexing.IndexingMixin):
     ) -> None: ...
     def to_pickle(
         self,
-        path: _str,
-        compression: _str | Literal["infer", "gzip", "bz2", "zip", "xz"] = ...,
+        path: FilePath | WriteBuffer[bytes],
+        compression: CompressionOptions = ...,
         protocol: int = ...,
+        storage_options: StorageOptions = ...,
     ) -> None: ...
     def to_clipboard(
         self, excel: _bool = ..., sep: _str | None = ..., **kwargs
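
Because the change lives on NDFrame, Series.to_pickle picks up the same annotations; a brief sketch under that assumption:

import pandas as pd

s = pd.Series([1, 2, 3], name="a")
s.to_pickle("series.pkl.xz", compression="xz")  # inherited NDFrame.to_pickle
restored = pd.read_pickle("series.pkl.xz", compression="xz")
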
26 changes: 17 additions & 9 deletions pandas-stubs/io/pickle.pyi
@@ -1,14 +1,22 @@
-from typing import Literal
+from typing import Any
 
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import (
+    CompressionOptions,
+    FilePath,
+    ReadPickleBuffer,
+    StorageOptions,
+    WriteBuffer,
+)
 
 def to_pickle(
Collaborator:
Since pandas.io.pickle.to_pickle() is not public, we should delete this here.

Collaborator:
still need to handle removal of to_pickle() from this file (and then the associated tests)

-    obj,
-    filepath_or_buffer: FilePathOrBuffer,
-    compression: str | None = ...,
+    obj: object,
+    filepath_or_buffer: FilePath | WriteBuffer[bytes],
+    compression: CompressionOptions = ...,
     protocol: int = ...,
-): ...
+    storage_options: StorageOptions = ...,
+) -> None: ...
 def read_pickle(
-    filepath_or_buffer_or_reader: FilePathOrBuffer,
-    compression: str | Literal["infer", "gzip", "bz2", "zip", "xz"] | None = ...,
-): ...
+    filepath_or_buffer: FilePath | ReadPickleBuffer,
+    compression: CompressionOptions = ...,
+    storage_options: StorageOptions = ...,
+) -> Any: ...
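
The reworked stubs can be exercised with assert_type in the style of this repo's tests; a minimal sketch (assumes typing_extensions is installed):

from typing import Any

import pandas as pd
from typing_extensions import assert_type

df = pd.DataFrame({"a": [1, 2, 3]})
assert_type(df.to_pickle("out.pkl"), None)   # to_pickle is annotated to return None
assert_type(pd.read_pickle("out.pkl"), Any)  # read_pickle is annotated as Any
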
72 changes: 72 additions & 0 deletions tests/test_io.py
@@ -0,0 +1,72 @@
import os
import tempfile
from typing import Any

from pandas import DataFrame
from typing_extensions import assert_type

from tests import check

from pandas.io.pickle import (
read_pickle,
to_pickle,
)

DF = DataFrame({"a": [1, 2, 3], "b": [0.0, 0.0, 0.0]})


def test_pickle():
with tempfile.NamedTemporaryFile(delete=False) as file:
check(assert_type(DF.to_pickle(file), None), type(None))
file.seek(0)
check(assert_type(read_pickle(file.name), Any), DataFrame)
file.close()
check(assert_type(read_pickle(file.name), Any), DataFrame)
os.unlink(file.name)

with tempfile.NamedTemporaryFile(delete=False) as file:
check(assert_type(to_pickle(DF, file), None), type(None))
Collaborator:
Since to_pickle() is not public, no need to test it.

Contributor Author:
It is listed in pandas/io/api.py, so I assume this makes it public even if it is not in the docs.

Collaborator:
It is listed in pandas/io/api.py, so I assume this makes it public even if it is not in the docs.

There are a few schools of thought here:

  1. We trim down pandas-stubs so that it only type checks what is in the documented public API
  2. If a function or class is "public" in the sense that it does not begin with an underscore but is not documented, we create a stub for it and test it.
  3. If a function or class is "public" in the sense that it does not begin with an underscore and is not documented, we are agnostic on creating a stub or testing that stub.

So far, @twoertwein and I have been leaning towards (1). With pandas.io.api.to_pickle(), you are proposing (2).

@twoertwein What are your thoughts?

Contributor Author:
I think it is more a question of what the "API" is. To me it is something like:

  • Documented as part of the docs
  • Imported into an api.py file or the top-level __init__.py file.

I think 2 is too broad because there hasn't been enough effort in pandas to underscore-prefix private modules, classes, and methods.

In short, if it seems to be part of an API, then it is reasonable to include it.

A related point is documenting public methods of classes that appear partially in the docs, something like Klass.method(arg1, arg2). Should all public methods of Klass be documented, or just Klass.__init__ and Klass.method?

Member:
My goal is that everything that is meant to be public (which is often unclear) is documented and in pandas-stubs. Personally, I think the best way is to remove any symbol from the stubs that is not meant to be public.

  • If it seems reasonable that it is meant to be public, it might be a better user experience, if we first open an issue at pandas before potentially removing it.
  • If we remove too much, we get user feedback and can then create an issue at pandas.

I believe typeshed uses a "# undocumented" comment to indicate which parts of their stubs are technically not documented. If we want to keep more in here, we could follow that approach. Luckily we have rather good connections to pandas :) so we might just open an issue there :)
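
For reference, that convention is just a trailing comment on the stub line; a hypothetical fragment:

# hypothetical .pyi fragment for a symbol that exists but is not in the documented API
def to_pickle(obj: object, path: str) -> None: ...  # undocumented
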

Member:
There are some grey zones: for example, a private super class whose inherited methods are public through a public child class. I would keep the parent class (in the long term, I would like pandas-stubs to align with pandas), define __all__, but exclude the parent class from it.
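
A hypothetical stub sketch of that pattern (names invented): the parent class stays in the stub so the child's inherited methods remain typed, but __all__ leaves it unexported:

# hypothetical example.pyi
__all__ = ["PublicChild"]  # ParentMixin deliberately left out

class ParentMixin:  # kept so inherited methods stay typed
    def shared_method(self) -> int: ...

class PublicChild(ParentMixin): ...
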

Collaborator:
I think it is more a question of what the "API" is. To me it is something like:

  • Documented as part of the docs
  • Imported into an api.py file or the top-level __init__.py file.

I think that's a fair definition. So using that, to_pickle() is public, but it appears to be undocumented, so someone should create an issue over in the pandas repo to indicate that it should be documented. In that case, we'll get a reaction of either "wait - that shouldn't be public" or "yes, let's document it".

Looking at the source, pandas.to_pickle() is what is really public, so I think you should change the tests to use pandas.to_pickle() rather than pandas.io.api.to_pickle(). The same goes for pandas.read_pickle() versus pandas.io.api.read_pickle().

file.seek(0)
check(assert_type(read_pickle(file.name), Any), DataFrame)
file.close()
check(assert_type(read_pickle(file.name), Any), DataFrame)
os.unlink(file.name)


def test_pickle_protocol():
with tempfile.NamedTemporaryFile(delete=False) as file:
DF.to_pickle(file, protocol=3)
file.seek(0)
check(assert_type(read_pickle(file.name), Any), DataFrame)
file.close()
check(assert_type(read_pickle(file.name), Any), DataFrame)
os.unlink(file.name)


def test_pickle_compression():
with tempfile.NamedTemporaryFile(delete=False) as file:
DF.to_pickle(file, compression="gzip")
file.seek(0)
check(
assert_type(read_pickle(file.name, compression="gzip"), Any),
DataFrame,
)
file.close()
check(
assert_type(read_pickle(file.name, compression="gzip"), Any),
DataFrame,
)
os.unlink(file.name)


def test_pickle_storage_options():
with tempfile.NamedTemporaryFile(delete=False) as file:
DF.to_pickle(file, storage_options={})
file.seek(0)
check(assert_type(read_pickle(file, storage_options={}), Any), DataFrame)
file.close()
check(
assert_type(read_pickle(file.name, storage_options={}), Any),
DataFrame,
)
os.unlink(file.name)