From 465721cdd2f0730447e5661b0ff2c26149e5612b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 19 Mar 2020 10:22:37 +0000 Subject: [PATCH 01/10] wip --- pandas/core/frame.py | 10 +- pandas/io/formats/format.py | 3 +- pandas/protocol/__init__.py | 0 pandas/protocol/wrapper.py | 58 ++++++++ pandas/wesm/__init__.py | 0 pandas/wesm/dataframe.py | 270 ++++++++++++++++++++++++++++++++++++ 6 files changed, 339 insertions(+), 2 deletions(-) create mode 100644 pandas/protocol/__init__.py create mode 100644 pandas/protocol/wrapper.py create mode 100644 pandas/wesm/__init__.py create mode 100644 pandas/wesm/dataframe.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index baa6fb07ff233..91387424d7860 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -129,6 +129,7 @@ ) from pandas.core.ops.missing import dispatch_fill_zeros from pandas.core.series import Series +from pandas.protocol.wrapper import PandasDataFrame from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt @@ -520,6 +521,13 @@ def __init__( NDFrame.__init__(self, mgr) + @property + def __dataframe__(self) -> PandasDataFrame: + """ + DataFrame interchange protocol + """ + return PandasDataFrame(self) + # ---------------------------------------------------------------------- @property @@ -720,7 +728,7 @@ def _repr_html_(self) -> Optional[str]: show_dimensions = get_option("display.show_dimensions") formatter = fmt.DataFrameFormatter( - self, + self.__dataframe__, columns=None, col_space=None, na_rep="NaN", diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index f011293273c5b..d18e18ede9500 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -79,6 +79,7 @@ from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: + from pandas.wesm import dataframe as dataframe_protocol from pandas import Series, DataFrame, Categorical FormattersType = Union[ @@ -540,7 +541,7 @@ class DataFrameFormatter(TableFormatter): def __init__( self, - frame: "DataFrame", + frame: "dataframe_protocol.DataFrame", columns: Optional[Sequence[str]] = None, col_space: Optional[Union[str, int]] = None, header: Union[bool, Sequence[str]] = True, diff --git a/pandas/protocol/__init__.py b/pandas/protocol/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/protocol/wrapper.py b/pandas/protocol/wrapper.py new file mode 100644 index 0000000000000..a9c4ea64938db --- /dev/null +++ b/pandas/protocol/wrapper.py @@ -0,0 +1,58 @@ +from typing import TYPE_CHECKING, Any, Hashable, Iterable, Sequence + +from pandas.wesm import dataframe as dataframe_protocol + +if TYPE_CHECKING: + from pandas import DataFrame + + +class PandasDataFrame(dataframe_protocol.DataFrame): + """ + Construct generic data frame from pandas DataFrame + + Parameters + ---------- + df : pd.DataFrame + """ + + def __init__(self, df: "DataFrame"): + self._df = df + + def column_by_index(self, i: int) -> dataframe_protocol.Column: + """ + Return the column at the indicated position. + """ + pass + + def column_by_name(self, key: Hashable) -> dataframe_protocol.Column: + """ + Return the column whose name is the indicated key. + """ + pass + + @property + def column_names(self) -> Sequence[Any]: + """ + Return the column names as a materialized sequence. + """ + pass + + def iter_column_names(self) -> Iterable[Any]: + """ + Return the column names as an iterable. + """ + pass + + @property + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + pass + + @property + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame. + """ + pass diff --git a/pandas/wesm/__init__.py b/pandas/wesm/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/wesm/dataframe.py b/pandas/wesm/dataframe.py new file mode 100644 index 0000000000000..2c76f3d0e9d13 --- /dev/null +++ b/pandas/wesm/dataframe.py @@ -0,0 +1,270 @@ +# MIT License +# +# Copyright (c) 2020 Wes McKinney + +from abc import ABC, abstractmethod +from collections.abc import Mapping, MutableMapping +from typing import Any, Hashable, Iterable, Optional, Sequence + +# ---------------------------------------------------------------------- +# A simple data type class hierarchy for illustration + + +class DataType(ABC): + """ + A metadata object representing the logical value type of a cell in a data + frame column. This metadata does not guarantee an specific underlying data + representation + """ + def __eq__(self, other: 'DataType'): # type: ignore + return self.equals(other) + + def __str__(self): + return self.to_string() + + def __repr__(self): + return str(self) + + @abstractmethod + def to_string(self) -> str: + """ + Return human-readable representation of the data type + """ + + @abstractmethod + def equals(self, other: 'DataType') -> bool: + """ + Return true if other DataType contains the same metadata as this + DataType + """ + pass + + +class PrimitiveType(DataType): + + def equals(self, other: DataType) -> bool: + return type(self) == type(other) + + +class NullType(PrimitiveType): + """ + A data type whose values are always null + """ + def to_string(self): + return "null" + + +class Boolean(PrimitiveType): + + def to_string(self): + return "bool" + + +class NumberType(PrimitiveType): + pass + + +class IntegerType(NumberType): + pass + + +class SignedIntegerType(IntegerType): + pass + + +class Int8(SignedIntegerType): + + def to_string(self): + return "int8" + + +class Int16(SignedIntegerType): + + def to_string(self): + return "int16" + + +class Int32(SignedIntegerType): + + def to_string(self): + return "int32" + + +class Int64(SignedIntegerType): + + def to_string(self): + return "int64" + + +class Binary(PrimitiveType): + """ + A variable-size binary (bytes) value + """ + def to_string(self): + return "binary" + + +class String(PrimitiveType): + """ + A UTF8-encoded string value + """ + def to_string(self): + return "string" + + +class Object(PrimitiveType): + """ + Any PyObject value + """ + def to_string(self): + return "object" + + +class Categorical(DataType): + """ + A categorical value is an ordinal (integer) value that references a + sequence of category values of an arbitrary data type + """ + + def __init__(self, index_type: IntegerType, category_type: DataType, + ordered: bool = False): + self.index_type = index_type + self.category_type = category_type + self.ordered = ordered + + def equals(self, other: DataType) -> bool: + return (isinstance(other, Categorical) and + self.index_type == other.index_type and + self.category_type == other.category_type and + self.ordered == other.ordered) + + def to_string(self): + return ("categorical(indices={}, categories={}, ordered={})" + .format(str(self.index_type), str(self.category_type), + self.ordered)) + + +# ---------------------------------------------------------------------- +# Classes representing a column in a DataFrame + + +class Column(ABC): + + @property + @abstractmethod + def name(self) -> Hashable: + pass + + @property + @abstractmethod + def type(self) -> DataType: + """ + Return the logical type of each column cell value + """ + pass + + def to_numpy(self): + """ + Access column's data as a NumPy array. Recommended to return a view if + able but not required + """ + raise NotImplementedError("Conversion to NumPy not available") + + def to_arrow(self, **kwargs): + """ + Access column's data in the Apache Arrow format as pyarrow.Array or + ChunkedArray. Recommended to return a view if able but not required + """ + raise NotImplementedError("Conversion to Arrow not available") + + +# ---------------------------------------------------------------------- +# DataFrame: the main public API + + +class DataFrame(ABC, Mapping): + """ + An abstract data frame base class. + + A "data frame" represents an ordered collection of named columns. A + column's "name" is permitted to be any hashable Python value, but strings + are common. Names are not required to be unique. Columns may be accessed by + name (when the name is unique) or by position. + """ + + def __dataframe__(self): + """ + Idempotence of data frame protocol + """ + return self + + def __iter__(self): + # TBD: Decide what iterating should return + return iter(self.column_names) + + def __len__(self): + return self.num_rows + + @property + @abstractmethod + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame + """ + pass + + @property + @abstractmethod + def num_rows(self) -> Optional[int]: + """ + Return the number of rows in the DataFrame (if known) + """ + pass + + @abstractmethod + def iter_column_names(self) -> Iterable[Any]: + """ + Return the column names as an iterable + """ + pass + + # TODO: Should this be a method or property? + @property + @abstractmethod + def column_names(self) -> Sequence[Any]: + """ + Return the column names as a materialized sequence + """ + pass + + # TODO: Should this be a method or property? + @property + def row_names(self) -> Sequence[Any]: + """ + Return the row names (if any) as a materialized sequence. It is not + necessary to implement this method + """ + raise NotImplementedError("row_names") + + def __getitem__(self, key: Hashable) -> Column: + return self.column_by_name(key) + + @abstractmethod + def column_by_name(self, key: Hashable) -> Column: + """ + Return the column whose name is the indicated key + """ + pass + + @abstractmethod + def column_by_index(self, i: int) -> Column: + """ + Return the column at the indicated position + """ + pass + + +class MutableDataFrame(DataFrame, MutableMapping): + # TODO: Mutable data frames are fraught at this interface level and + # need more discussion + pass From 245c4f8ad9a677a3cdaf3866ce5e14caef37bda4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 21 Mar 2020 11:17:28 +0000 Subject: [PATCH 02/10] wip --- pandas/core/frame.py | 24 +++++++++++--- pandas/io/formats/format.py | 8 +++-- pandas/protocol/wrapper.py | 66 ++++++++++++++++++++++++++++++++----- pandas/wesm/dataframe.py | 38 +++++++++++---------- 4 files changed, 102 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dba018249f81a..d3b264b47a77a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -129,7 +129,7 @@ ) from pandas.core.ops.missing import dispatch_fill_zeros from pandas.core.series import Series -from pandas.protocol.wrapper import PandasDataFrame +from pandas.protocol.wrapper import DataFrame as DataFrameWrapper from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt @@ -139,6 +139,7 @@ if TYPE_CHECKING: from pandas.core.groupby.generic import DataFrameGroupBy from pandas.io.formats.style import Styler + from pandas.wesm import dataframe as dataframe_protocol # noqa: F401 # --------------------------------------------------------------------- # Docstring templates @@ -436,6 +437,21 @@ def __init__( if isinstance(data, DataFrame): data = data._data + elif hasattr(data, "__dataframe__"): + # materialization as dict of numpy arrays + obj = cast("dataframe_protocol.DataFrame", data.__dataframe__) + + def _get_column(col): + try: + return col.to_numpy() + except NotImplementedError: + return col.to_arrow() + + data = { + column_name: _get_column(obj[column_name]) + for column_name in obj.column_names + } + if isinstance(data, BlockManager): mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy @@ -522,11 +538,11 @@ def __init__( NDFrame.__init__(self, mgr) @property - def __dataframe__(self) -> PandasDataFrame: + def __dataframe__(self) -> DataFrameWrapper: """ DataFrame interchange protocol """ - return PandasDataFrame(self) + return DataFrameWrapper(self) # ---------------------------------------------------------------------- @@ -728,7 +744,7 @@ def _repr_html_(self) -> Optional[str]: show_dimensions = get_option("display.show_dimensions") formatter = fmt.DataFrameFormatter( - self.__dataframe__, + self, columns=None, col_space=None, na_rep="NaN", diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d18e18ede9500..58db64ff96c51 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -79,7 +79,6 @@ from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: - from pandas.wesm import dataframe as dataframe_protocol from pandas import Series, DataFrame, Categorical FormattersType = Union[ @@ -541,7 +540,7 @@ class DataFrameFormatter(TableFormatter): def __init__( self, - frame: "dataframe_protocol.DataFrame", + frame: "DataFrame", columns: Optional[Sequence[str]] = None, col_space: Optional[Union[str, int]] = None, header: Union[bool, Sequence[str]] = True, @@ -563,7 +562,10 @@ def __init__( bold_rows: bool = False, escape: bool = True, ): - self.frame = frame + from pandas.core.frame import DataFrame + + # round-trip pandas DataFrame thro interchange protocol + self.frame = DataFrame(frame.__dataframe__) self.show_index_names = index_names if sparsify is None: diff --git a/pandas/protocol/wrapper.py b/pandas/protocol/wrapper.py index a9c4ea64938db..300ce726b8853 100644 --- a/pandas/protocol/wrapper.py +++ b/pandas/protocol/wrapper.py @@ -3,10 +3,50 @@ from pandas.wesm import dataframe as dataframe_protocol if TYPE_CHECKING: - from pandas import DataFrame + import pandas as pd -class PandasDataFrame(dataframe_protocol.DataFrame): +class Column(dataframe_protocol.Column): + """ + Construct generic column from pandas Series + + Parameters + ---------- + ser : pd.Series + """ + + _ser: "pd.Series" + + def __init__(self, ser: "pd.Series"): + self._ser = ser + + @property + def name(self) -> Hashable: + raise NotImplementedError + + @property + def type(self) -> dataframe_protocol.DataType: + """ + Return the logical type of each column cell value + """ + raise NotImplementedError + + def to_numpy(self): + """ + Access column's data as a NumPy array. Recommended to return a view if + able but not required + """ + return self._ser.to_numpy() + + def to_arrow(self, **kwargs): + """ + Access column's data in the Apache Arrow format as pyarrow.Array or + ChunkedArray. Recommended to return a view if able but not required + """ + raise NotImplementedError("Conversion to Arrow not available") + + +class DataFrame(dataframe_protocol.DataFrame): """ Construct generic data frame from pandas DataFrame @@ -15,44 +55,52 @@ class PandasDataFrame(dataframe_protocol.DataFrame): df : pd.DataFrame """ - def __init__(self, df: "DataFrame"): + _df: "pd.DataFrame" + + def __init__(self, df: "pd.DataFrame"): self._df = df + def __str__(self): + return str(self._df) + + def __repr__(self): + return repr(self._df) + def column_by_index(self, i: int) -> dataframe_protocol.Column: """ Return the column at the indicated position. """ - pass + raise NotImplementedError def column_by_name(self, key: Hashable) -> dataframe_protocol.Column: """ Return the column whose name is the indicated key. """ - pass + return Column(self._df[key]) @property def column_names(self) -> Sequence[Any]: """ Return the column names as a materialized sequence. """ - pass + return self._df.columns.to_list() def iter_column_names(self) -> Iterable[Any]: """ Return the column names as an iterable. """ - pass + raise NotImplementedError @property def num_columns(self) -> int: """ Return the number of columns in the DataFrame. """ - pass + raise NotImplementedError @property def num_rows(self) -> int: """ Return the number of rows in the DataFrame. """ - pass + raise NotImplementedError diff --git a/pandas/wesm/dataframe.py b/pandas/wesm/dataframe.py index 2c76f3d0e9d13..2976b975cb9fb 100644 --- a/pandas/wesm/dataframe.py +++ b/pandas/wesm/dataframe.py @@ -16,7 +16,8 @@ class DataType(ABC): frame column. This metadata does not guarantee an specific underlying data representation """ - def __eq__(self, other: 'DataType'): # type: ignore + + def __eq__(self, other: "DataType"): # type: ignore return self.equals(other) def __str__(self): @@ -32,7 +33,7 @@ def to_string(self) -> str: """ @abstractmethod - def equals(self, other: 'DataType') -> bool: + def equals(self, other: "DataType") -> bool: """ Return true if other DataType contains the same metadata as this DataType @@ -41,7 +42,6 @@ def equals(self, other: 'DataType') -> bool: class PrimitiveType(DataType): - def equals(self, other: DataType) -> bool: return type(self) == type(other) @@ -50,12 +50,12 @@ class NullType(PrimitiveType): """ A data type whose values are always null """ + def to_string(self): return "null" class Boolean(PrimitiveType): - def to_string(self): return "bool" @@ -73,25 +73,21 @@ class SignedIntegerType(IntegerType): class Int8(SignedIntegerType): - def to_string(self): return "int8" class Int16(SignedIntegerType): - def to_string(self): return "int16" class Int32(SignedIntegerType): - def to_string(self): return "int32" class Int64(SignedIntegerType): - def to_string(self): return "int64" @@ -100,6 +96,7 @@ class Binary(PrimitiveType): """ A variable-size binary (bytes) value """ + def to_string(self): return "binary" @@ -108,6 +105,7 @@ class String(PrimitiveType): """ A UTF8-encoded string value """ + def to_string(self): return "string" @@ -116,6 +114,7 @@ class Object(PrimitiveType): """ Any PyObject value """ + def to_string(self): return "object" @@ -126,22 +125,25 @@ class Categorical(DataType): sequence of category values of an arbitrary data type """ - def __init__(self, index_type: IntegerType, category_type: DataType, - ordered: bool = False): + def __init__( + self, index_type: IntegerType, category_type: DataType, ordered: bool = False + ): self.index_type = index_type self.category_type = category_type self.ordered = ordered def equals(self, other: DataType) -> bool: - return (isinstance(other, Categorical) and - self.index_type == other.index_type and - self.category_type == other.category_type and - self.ordered == other.ordered) + return ( + isinstance(other, Categorical) + and self.index_type == other.index_type + and self.category_type == other.category_type + and self.ordered == other.ordered + ) def to_string(self): - return ("categorical(indices={}, categories={}, ordered={})" - .format(str(self.index_type), str(self.category_type), - self.ordered)) + return "categorical(indices={}, categories={}, ordered={})".format( + str(self.index_type), str(self.category_type), self.ordered + ) # ---------------------------------------------------------------------- @@ -149,7 +151,6 @@ def to_string(self): class Column(ABC): - @property @abstractmethod def name(self) -> Hashable: @@ -192,6 +193,7 @@ class DataFrame(ABC, Mapping): name (when the name is unique) or by position. """ + @property def __dataframe__(self): """ Idempotence of data frame protocol From 002ec98bf461a9c38e67346583c46dab5e0f03fa Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 08:24:14 +0000 Subject: [PATCH 03/10] wip --- pandas/core/frame.py | 3 ++- pandas/io/formats/format.py | 5 +--- pandas/protocol/wrapper.py | 17 ++++++------- pandas/tests/test_downstream.py | 42 +++++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d3b264b47a77a..19d15f3d8af91 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -438,7 +438,8 @@ def __init__( data = data._data elif hasattr(data, "__dataframe__"): - # materialization as dict of numpy arrays + # construct using dict of numpy arrays + # TODO: index, columns, dtype and copy arguments obj = cast("dataframe_protocol.DataFrame", data.__dataframe__) def _get_column(col): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 58db64ff96c51..f011293273c5b 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -562,10 +562,7 @@ def __init__( bold_rows: bool = False, escape: bool = True, ): - from pandas.core.frame import DataFrame - - # round-trip pandas DataFrame thro interchange protocol - self.frame = DataFrame(frame.__dataframe__) + self.frame = frame self.show_index_names = index_names if sparsify is None: diff --git a/pandas/protocol/wrapper.py b/pandas/protocol/wrapper.py index 300ce726b8853..c2d5fa2a08d1e 100644 --- a/pandas/protocol/wrapper.py +++ b/pandas/protocol/wrapper.py @@ -4,6 +4,7 @@ if TYPE_CHECKING: import pandas as pd + import numpy as np class Column(dataframe_protocol.Column): @@ -22,7 +23,7 @@ def __init__(self, ser: "pd.Series"): @property def name(self) -> Hashable: - raise NotImplementedError + return self._ser.name @property def type(self) -> dataframe_protocol.DataType: @@ -31,7 +32,7 @@ def type(self) -> dataframe_protocol.DataType: """ raise NotImplementedError - def to_numpy(self): + def to_numpy(self) -> "np.ndarray": """ Access column's data as a NumPy array. Recommended to return a view if able but not required @@ -60,17 +61,17 @@ class DataFrame(dataframe_protocol.DataFrame): def __init__(self, df: "pd.DataFrame"): self._df = df - def __str__(self): + def __str__(self) -> str: return str(self._df) - def __repr__(self): + def __repr__(self) -> str: return repr(self._df) def column_by_index(self, i: int) -> dataframe_protocol.Column: """ Return the column at the indicated position. """ - raise NotImplementedError + return Column(self._df.iloc[:, i]) def column_by_name(self, key: Hashable) -> dataframe_protocol.Column: """ @@ -89,18 +90,18 @@ def iter_column_names(self) -> Iterable[Any]: """ Return the column names as an iterable. """ - raise NotImplementedError + return self.column_names @property def num_columns(self) -> int: """ Return the number of columns in the DataFrame. """ - raise NotImplementedError + return self._df.shape[1] @property def num_rows(self) -> int: """ Return the number of rows in the DataFrame. """ - raise NotImplementedError + return len(self._df) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 122ef1f47968e..6ff04b188ae79 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -10,6 +10,7 @@ from pandas import DataFrame import pandas._testing as tm +from pandas.wesm import dataframe as dataframe_protocol def import_module(name): @@ -147,3 +148,44 @@ def test_missing_required_dependency(): output = exc.value.stdout.decode() for name in ["numpy", "pytz", "dateutil"]: assert name in output + + +# ----------------------------------------------------------------------------- +# DataFrame interchange protocol +# ----------------------------------------------------------------------------- + + +class TestDataFrameProtocol: + def test_interface_smoketest(self): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + result = df.__dataframe__ + assert isinstance(result, dataframe_protocol.DataFrame) + assert isinstance(result["a"], dataframe_protocol.Column) + assert isinstance(result.column_by_index(0), dataframe_protocol.Column) + # assert isinstance(result['a'].dtype, dataframe_protocol.DataType) + + assert result.num_rows == 3 + assert result.num_columns == 2 + assert result.column_names == ["a", "b"] + assert list(result.iter_column_names()) == ["a", "b"] + + expected = np.array([1, 2, 3], dtype=np.int64) + res = result["a"].to_numpy() + tm.assert_numpy_array_equal(res, expected) + res = result.column_by_index(0).to_numpy() + tm.assert_numpy_array_equal(res, expected) + + assert result["a"].name == "a" + assert result.column_by_index(0).name == 'a' + + def test_pandas_dataframe_constructor(self): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + result = DataFrame(df) + tm.assert_frame_equal(result, df) + assert result is not df + + result = DataFrame(df.__dataframe__) + tm.assert_frame_equal(result, df) + assert result is not df From 0665e95edfb33ad3a426a3cd6f27c965ba89898b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 08:42:55 +0000 Subject: [PATCH 04/10] mypy fixup make Column.name Optional --- pandas/protocol/wrapper.py | 4 ++-- pandas/wesm/dataframe.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/protocol/wrapper.py b/pandas/protocol/wrapper.py index c2d5fa2a08d1e..9e68b4d84066b 100644 --- a/pandas/protocol/wrapper.py +++ b/pandas/protocol/wrapper.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Hashable, Iterable, Sequence +from typing import TYPE_CHECKING, Any, Hashable, Iterable, Optional, Sequence from pandas.wesm import dataframe as dataframe_protocol @@ -22,7 +22,7 @@ def __init__(self, ser: "pd.Series"): self._ser = ser @property - def name(self) -> Hashable: + def name(self) -> Optional[Hashable]: return self._ser.name @property diff --git a/pandas/wesm/dataframe.py b/pandas/wesm/dataframe.py index 2976b975cb9fb..1ccca2e39d307 100644 --- a/pandas/wesm/dataframe.py +++ b/pandas/wesm/dataframe.py @@ -153,7 +153,7 @@ def to_string(self): class Column(ABC): @property @abstractmethod - def name(self) -> Hashable: + def name(self) -> Optional[Hashable]: pass @property From e78d79746eecf885d993946455c62212aceb612e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 14:37:56 +0000 Subject: [PATCH 05/10] wip --- pandas/core/frame.py | 12 +++++++- pandas/protocol/wrapper.py | 8 ++++++ pandas/tests/test_downstream.py | 49 ++++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d6c19ea567e1f..dc0e68f36c877 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -439,7 +439,7 @@ def __init__( elif hasattr(data, "__dataframe__"): # construct using dict of numpy arrays - # TODO: index, columns, dtype and copy arguments + # TODO(simonjayhawkins) index, columns, dtype and copy arguments obj = cast("dataframe_protocol.DataFrame", data.__dataframe__) def _get_column(col): @@ -453,6 +453,16 @@ def _get_column(col): for column_name in obj.column_names } + if not index: + try: + index = MultiIndex.from_tuples(obj.row_names) + except TypeError: + index = obj.row_names + except NotImplementedError: + # It is not necessary to implement row_names in the + # dataframe interchange protocol + pass + if isinstance(data, BlockManager): mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy diff --git a/pandas/protocol/wrapper.py b/pandas/protocol/wrapper.py index 9e68b4d84066b..5a8be41f2160e 100644 --- a/pandas/protocol/wrapper.py +++ b/pandas/protocol/wrapper.py @@ -86,6 +86,14 @@ def column_names(self) -> Sequence[Any]: """ return self._df.columns.to_list() + @property + def row_names(self) -> Sequence[Any]: + """ + Return the row names (if any) as a materialized sequence. It is not + necessary to implement this method + """ + return self._df.index.to_list() + def iter_column_names(self) -> Iterable[Any]: """ Return the column names as an iterable. diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 6ff04b188ae79..125e0c8aa5308 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -10,6 +10,7 @@ from pandas import DataFrame import pandas._testing as tm +from pandas.protocol.wrapper import DataFrame as DataFrameWrapper from pandas.wesm import dataframe as dataframe_protocol @@ -163,12 +164,14 @@ def test_interface_smoketest(self): assert isinstance(result, dataframe_protocol.DataFrame) assert isinstance(result["a"], dataframe_protocol.Column) assert isinstance(result.column_by_index(0), dataframe_protocol.Column) + # TODO(simonjayhawkins) don't leave commented out # assert isinstance(result['a'].dtype, dataframe_protocol.DataType) assert result.num_rows == 3 assert result.num_columns == 2 assert result.column_names == ["a", "b"] assert list(result.iter_column_names()) == ["a", "b"] + assert result.row_names == [0, 1, 2] expected = np.array([1, 2, 3], dtype=np.int64) res = result["a"].to_numpy() @@ -177,9 +180,10 @@ def test_interface_smoketest(self): tm.assert_numpy_array_equal(res, expected) assert result["a"].name == "a" - assert result.column_by_index(0).name == 'a' + assert result.column_by_index(0).name == "a" def test_pandas_dataframe_constructor(self): + # TODO(simonjayhawkins): move to test_constructors.py df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) result = DataFrame(df) @@ -189,3 +193,46 @@ def test_pandas_dataframe_constructor(self): result = DataFrame(df.__dataframe__) tm.assert_frame_equal(result, df) assert result is not df + + # It is not necessary to implement row_names in the + # dataframe interchange protocol + + # TODO(simonjayhawkins) how to monkeypatch property with pytest + # raises AttributeError: can't set attribute + + class _DataFrameWrapper(DataFrameWrapper): + @property + def row_names(self): + raise NotImplementedError("row_names") + + result = _DataFrameWrapper(df) + with pytest.raises(NotImplementedError, match="row_names"): + result.row_names + + result = DataFrame(result) + tm.assert_frame_equal(result, df) + + def test_multiindex(self): + df = ( + DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + .reset_index() + .set_index(["index", "a"]) + ) + result = df.__dataframe__ + + assert result.row_names == [(0, 1), (1, 2), (2, 3)] + + # TODO(simonjayhawkins) split this test and move to test_constructors.py + result = DataFrame(result) + # index and column names are not available from the protocol api + tm.assert_frame_equal(result, df, check_names=False) + + df = df.unstack() + result = df.__dataframe__ + + assert result.column_names == [("b", 1), ("b", 2), ("b", 3)] + + # TODO(simonjayhawkins) split this test and move to test_constructors.py + result = DataFrame(result) + # index and column names are not available from the protocol api + tm.assert_frame_equal(result, df, check_names=False) From 6ddfaf5a2db196348c6c937a85f752605e47a296 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 16:23:02 +0000 Subject: [PATCH 06/10] wip --- pandas/protocol/wrapper.py | 32 +----- pandas/tests/test_downstream.py | 16 ++- pandas/wesm/example_dict_of_ndarray.py | 152 +++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 31 deletions(-) create mode 100644 pandas/wesm/example_dict_of_ndarray.py diff --git a/pandas/protocol/wrapper.py b/pandas/protocol/wrapper.py index 5a8be41f2160e..e4652800421e4 100644 --- a/pandas/protocol/wrapper.py +++ b/pandas/protocol/wrapper.py @@ -1,13 +1,13 @@ -from typing import TYPE_CHECKING, Any, Hashable, Iterable, Optional, Sequence +from typing import TYPE_CHECKING, Any, Hashable, Iterable, Sequence from pandas.wesm import dataframe as dataframe_protocol +from pandas.wesm.example_dict_of_ndarray import NumPyColumn if TYPE_CHECKING: import pandas as pd - import numpy as np -class Column(dataframe_protocol.Column): +class Column(NumPyColumn): """ Construct generic column from pandas Series @@ -20,31 +20,7 @@ class Column(dataframe_protocol.Column): def __init__(self, ser: "pd.Series"): self._ser = ser - - @property - def name(self) -> Optional[Hashable]: - return self._ser.name - - @property - def type(self) -> dataframe_protocol.DataType: - """ - Return the logical type of each column cell value - """ - raise NotImplementedError - - def to_numpy(self) -> "np.ndarray": - """ - Access column's data as a NumPy array. Recommended to return a view if - able but not required - """ - return self._ser.to_numpy() - - def to_arrow(self, **kwargs): - """ - Access column's data in the Apache Arrow format as pyarrow.Array or - ChunkedArray. Recommended to return a view if able but not required - """ - raise NotImplementedError("Conversion to Arrow not available") + super().__init__(ser.name, ser.to_numpy()) class DataFrame(dataframe_protocol.DataFrame): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 125e0c8aa5308..66893819eac0f 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -11,7 +11,7 @@ from pandas import DataFrame import pandas._testing as tm from pandas.protocol.wrapper import DataFrame as DataFrameWrapper -from pandas.wesm import dataframe as dataframe_protocol +from pandas.wesm import dataframe as dataframe_protocol, example_dict_of_ndarray def import_module(name): @@ -164,8 +164,7 @@ def test_interface_smoketest(self): assert isinstance(result, dataframe_protocol.DataFrame) assert isinstance(result["a"], dataframe_protocol.Column) assert isinstance(result.column_by_index(0), dataframe_protocol.Column) - # TODO(simonjayhawkins) don't leave commented out - # assert isinstance(result['a'].dtype, dataframe_protocol.DataType) + assert isinstance(result["a"].type, dataframe_protocol.DataType) assert result.num_rows == 3 assert result.num_columns == 2 @@ -182,6 +181,10 @@ def test_interface_smoketest(self): assert result["a"].name == "a" assert result.column_by_index(0).name == "a" + expected_type = dataframe_protocol.Int64() + assert result["a"].type == expected_type + assert result.column_by_index(0).type == expected_type + def test_pandas_dataframe_constructor(self): # TODO(simonjayhawkins): move to test_constructors.py df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -236,3 +239,10 @@ def test_multiindex(self): result = DataFrame(result) # index and column names are not available from the protocol api tm.assert_frame_equal(result, df, check_names=False) + + def test_example_dict_of_ndarray(self): + data, names, df = example_dict_of_ndarray.get_example() + df = DataFrame(df) + expected = DataFrame(data) + tm.assert_frame_equal(df, expected) + assert df.columns.to_list() == names diff --git a/pandas/wesm/example_dict_of_ndarray.py b/pandas/wesm/example_dict_of_ndarray.py new file mode 100644 index 0000000000000..c2f7e4a741a3a --- /dev/null +++ b/pandas/wesm/example_dict_of_ndarray.py @@ -0,0 +1,152 @@ +# MIT License +# +# Copyright (c) 2020 Wes McKinney + +from typing import Dict, Hashable, Sequence +import pandas.wesm.dataframe as dataframe + +import numpy as np + + +_numeric_types = { + "int8": dataframe.Int8(), + "int16": dataframe.Int16(), + "int32": dataframe.Int32(), + "int64": dataframe.Int64(), +} + + +def _integer_factory(dtype): + return _numeric_types[dtype.name] + + +def _constant_factory(type_instance): + def factory(*unused): + return type_instance + + return factory + + +_type_factories = { + "b": _constant_factory(dataframe.Boolean()), + "i": _integer_factory, + "O": _constant_factory(dataframe.Object()), + "S": _constant_factory(dataframe.Binary()), + "U": _constant_factory(dataframe.String()), +} + + +class NumPyColumn(dataframe.Column): + def __init__(self, name, data): + self._name = name + self._data = data + + @property + def name(self) -> Hashable: + return self._name + + @property + def type(self) -> dataframe.DataType: + factory = _type_factories.get(self._data.dtype.kind) + if factory is None: + raise NotImplementedError( + "Data frame type for NumPy Type {} " + "not known".format(str(self._data.dtype)) + ) + return factory(self._data.dtype) + + def to_numpy(self): + return self._data + + +class DictDataFrame(dataframe.DataFrame): + """ + Construct data frame from dict of NumPy arrays + + Parameters + ---------- + data : dict + names : sequence, default None + If not passed, the names will be determined by the data's keys + num_rows : int, default None + If not passed, determined from the data + """ + + def __init__( + self, + columns: Dict[Hashable, np.ndarray], + names: Sequence[Hashable] = None, + num_rows: int = None, + ): + if names is None: + names = list(columns.keys()) + + assert len(columns) == len(names) + + self._columns = columns.copy() + self._names = list(names) + # self._name_to_index = {i: k for i, k in enumerate(self._names)} + + if len(columns) > 0: + assert num_rows is None + self._num_rows = len(next(iter(columns.values()))) + else: + self._num_rows = num_rows + + @property + def num_columns(self): + return len(self._columns) + + @property + def num_rows(self): + return self._num_rows + + def iter_column_names(self): + return iter(self._names) + + @property + def column_names(self): + return self._names + + def column_by_name(self, key: Hashable) -> NumPyColumn: + return NumPyColumn(key, self._columns[key]) + + def column_by_index(self, i: int) -> NumPyColumn: + return NumPyColumn(self._names[i], self._columns[self._names[i]]) + + +def get_example(): + data = { + "a": np.array([1, 2, 3, 4, 5], dtype="int64"), + "b": np.array(["a", "b", "c", "d", "e"]), + "c": np.array([True, False, True, False, True]), + } + names = ["a", "b", "c"] + return data, names, DictDataFrame(data, names=names) + + +def test_basic_behavior(): + raw_data, names, df = get_example() + + assert len(df) == 5 + assert df.num_columns == 3 + assert df.num_rows == 5 + + for i, name in enumerate(df.column_names): + assert name == names[i] + + for i, name in enumerate(df.iter_column_names()): + assert name == names[i] + + expected_types = { + "a": dataframe.Int64(), + "b": dataframe.String(), + "c": dataframe.Boolean(), + } + + for i, name in enumerate(names): + col = df[name] + assert col.name == name + assert col.type == expected_types[name] + assert col.to_numpy() is raw_data[name] + assert df.column_by_index(i).name == col.name From 46f594e85d34e5b8257ce4ab83c8712b1b1daa49 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 16:24:42 +0000 Subject: [PATCH 07/10] isort fixup --- pandas/wesm/example_dict_of_ndarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/wesm/example_dict_of_ndarray.py b/pandas/wesm/example_dict_of_ndarray.py index c2f7e4a741a3a..80dceef96df79 100644 --- a/pandas/wesm/example_dict_of_ndarray.py +++ b/pandas/wesm/example_dict_of_ndarray.py @@ -3,10 +3,10 @@ # Copyright (c) 2020 Wes McKinney from typing import Dict, Hashable, Sequence -import pandas.wesm.dataframe as dataframe import numpy as np +import pandas.wesm.dataframe as dataframe _numeric_types = { "int8": dataframe.Int8(), From 1ce9779c19ebb1262bef546a5a294fb3536d4b5d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 16:43:26 +0000 Subject: [PATCH 08/10] mypy fixup --- pandas/wesm/example_dict_of_ndarray.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/wesm/example_dict_of_ndarray.py b/pandas/wesm/example_dict_of_ndarray.py index 80dceef96df79..c90302d8c96f3 100644 --- a/pandas/wesm/example_dict_of_ndarray.py +++ b/pandas/wesm/example_dict_of_ndarray.py @@ -2,7 +2,7 @@ # # Copyright (c) 2020 Wes McKinney -from typing import Dict, Hashable, Sequence +from typing import Any, Dict, Hashable, Optional, Sequence import numpy as np @@ -71,12 +71,13 @@ class DictDataFrame(dataframe.DataFrame): num_rows : int, default None If not passed, determined from the data """ + _num_rows: Optional[int] def __init__( self, columns: Dict[Hashable, np.ndarray], - names: Sequence[Hashable] = None, - num_rows: int = None, + names: Optional[Sequence[Hashable]] = None, + num_rows: Optional[int] = None, ): if names is None: names = list(columns.keys()) @@ -116,7 +117,7 @@ def column_by_index(self, i: int) -> NumPyColumn: def get_example(): - data = { + data: Dict[Hashable, Any] = { "a": np.array([1, 2, 3, 4, 5], dtype="int64"), "b": np.array(["a", "b", "c", "d", "e"]), "c": np.array([True, False, True, False, True]), From 15b1622c67599cf007a9ce1d39c8efd20710cb7e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 19:09:05 +0000 Subject: [PATCH 09/10] code checks fixup --- pandas/wesm/dataframe.py | 6 +++--- pandas/wesm/example_dict_of_ndarray.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/wesm/dataframe.py b/pandas/wesm/dataframe.py index 1ccca2e39d307..d8e1b5b63dc3d 100644 --- a/pandas/wesm/dataframe.py +++ b/pandas/wesm/dataframe.py @@ -3,7 +3,7 @@ # Copyright (c) 2020 Wes McKinney from abc import ABC, abstractmethod -from collections.abc import Mapping, MutableMapping +from collections import abc from typing import Any, Hashable, Iterable, Optional, Sequence # ---------------------------------------------------------------------- @@ -183,7 +183,7 @@ def to_arrow(self, **kwargs): # DataFrame: the main public API -class DataFrame(ABC, Mapping): +class DataFrame(ABC, abc.Mapping): """ An abstract data frame base class. @@ -266,7 +266,7 @@ def column_by_index(self, i: int) -> Column: pass -class MutableDataFrame(DataFrame, MutableMapping): +class MutableDataFrame(DataFrame, abc.MutableMapping): # TODO: Mutable data frames are fraught at this interface level and # need more discussion pass diff --git a/pandas/wesm/example_dict_of_ndarray.py b/pandas/wesm/example_dict_of_ndarray.py index c90302d8c96f3..ba22ed5ea9a1a 100644 --- a/pandas/wesm/example_dict_of_ndarray.py +++ b/pandas/wesm/example_dict_of_ndarray.py @@ -71,6 +71,7 @@ class DictDataFrame(dataframe.DataFrame): num_rows : int, default None If not passed, determined from the data """ + _num_rows: Optional[int] def __init__( From ef2e97d2eb1d0b39261bbd507a811dc7a17915bd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Mar 2020 19:14:27 +0000 Subject: [PATCH 10/10] test fixup --- pandas/tests/api/test_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 5aab5b814bae7..97209b64afb8f 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -203,6 +203,8 @@ class TestPDApi(Base): "_tslib", "_typing", "_version", + "protocol", + "wesm", ] def test_api(self):