From 4e97e0552f04476a597002861d1cb91880e00be9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2019 14:24:29 +0100 Subject: [PATCH 1/7] Add __from_arrow__ support for IntegerArray --- pandas/core/arrays/integer.py | 28 ++++++++++++++++++++++++++++ pandas/tests/arrays/test_integer.py | 12 ++++++++++++ pandas/tests/io/test_parquet.py | 14 ++++++++++---- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 08b53e54b91ef..d841e247e495d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -85,6 +85,34 @@ def construct_array_type(cls): """ return IntegerArray + def __from_arrow__(self, array): + """Construct IntegerArray from passed pyarrow Array""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + buflist = arr.buffers() + data = np.frombuffer(buflist[1], dtype=self.type)[ + arr.offset : arr.offset + len(arr) + ] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + int_arr = IntegerArray(data.copy(), ~mask, copy=False) + results.append(int_arr) + + return IntegerArray._concat_same_type(results) + def integer_array(values, dtype=None, copy=False): """ diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 793de66767cc3..13e925f37f41f 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -829,6 +829,18 @@ def test_arrow_array(data): assert arr.equals(expected) +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(data): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + result = table.to_pandas() + tm.assert_frame_equal(result, df) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index debc797fe6e88..3d5554fe26435 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -514,13 +514,19 @@ def test_additional_extension_arrays(self, pa): "b": pd.Series(["a", None, "c"], dtype="string"), } ) - # currently de-serialized as plain int / object - expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df.assign(b=df.b.astype("object")) + else: + # de-serialized as plain int / object + expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) check_round_trip(df, pa, expected=expected) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) - # if missing values in integer, currently de-serialized as float - expected = df.assign(a=df.a.astype("float64")) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df + else: + # if missing values in integer, currently de-serialized as float + expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) From 27dcbfbaae86798cf94eb6a1cdd95604e9fc2d5b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2019 14:35:02 +0100 Subject: [PATCH 2/7] Add __from_arrow__ support for StringArray --- pandas/core/arrays/string_.py | 16 ++++++++++++++++ pandas/tests/arrays/string_/test_string.py | 14 ++++++++++++++ pandas/tests/io/test_parquet.py | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c487b227de20..c185cc9495005 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -85,6 +85,22 @@ def construct_array_type(cls) -> "Type[StringArray]": def __repr__(self) -> str: return "StringDtype" + def __from_arrow__(self, array): + """Construct StringArray from passed pyarrow Array""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + str_arr = StringArray(np.array(arr)) + results.append(str_arr) + + return StringArray._concat_same_type(results) + class StringArray(PandasArray): """ diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efe2b4e0b2deb..843e09ea891eb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -171,3 +171,17 @@ def test_arrow_array(): arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array(["a", "b", "c"], dtype="string") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "string" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 3d5554fe26435..e42904e40768a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -515,7 +515,7 @@ def test_additional_extension_arrays(self, pa): } ) if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): - expected = df.assign(b=df.b.astype("object")) + expected = df else: # de-serialized as plain int / object expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) From 72f41428ac7f0b529968b7f67805dbc696c8f93c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2019 14:41:05 +0100 Subject: [PATCH 3/7] add whatsnew / docs --- doc/source/user_guide/io.rst | 1 + doc/source/whatsnew/v1.0.0.rst | 3 +++ 2 files changed, 4 insertions(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ef87b6c57b1b9..756ddda984087 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4717,6 +4717,7 @@ Several caveats. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. +* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data type (requiring pyarrow >= 1.0.0). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cb1d80a34514c..4fc5be83edd94 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -113,6 +113,9 @@ Other enhancements - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) +- Roundtripping DataFrames with nullable integer or string data types to parquet + (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine + now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). Build Changes ^^^^^^^^^^^^^ From 38218adfde25d91687d4ca24619eedb33802b5ee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Nov 2019 17:19:42 +0100 Subject: [PATCH 4/7] fix None -> np.nan conversion --- pandas/core/arrays/string_.py | 3 ++- pandas/tests/arrays/string_/test_string.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c185cc9495005..5009da32654eb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -96,7 +96,8 @@ def __from_arrow__(self, array): results = [] for arr in chunks: - str_arr = StringArray(np.array(arr)) + # using _from_sequence to ensure None is convered to np.nan + str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) return StringArray._concat_same_type(results) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 843e09ea891eb..1ce62d8f8b3d9 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -178,10 +178,12 @@ def test_arrow_roundtrip(): # roundtrip possible from arrow 1.0.0 import pyarrow as pa - data = pd.array(["a", "b", "c"], dtype="string") + data = pd.array(["a", "b", None], dtype="string") df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == "string" result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) tm.assert_frame_equal(result, df) + # ensure the missing value is represented by NaN and not None + assert np.isnan(result.loc[2, "a"]) From 0521875b53d7e0d782aa5c9175fa996a0d76b08f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Nov 2019 17:39:56 +0100 Subject: [PATCH 5/7] better documentation + add comment --- doc/source/development/extending.rst | 44 ++++++++++++++++++++++++++++ doc/source/user_guide/io.rst | 4 ++- pandas/core/arrays/integer.py | 3 +- pandas/core/arrays/string_.py | 3 +- 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index e341dcb8318bc..d4fd4edf729d8 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -251,6 +251,50 @@ To use a test, subclass it: See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py for a list of all the tests available. +.. _extending.extension.arrow: + +Compatibility with Apache Arrow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An ``ExtensionArray`` can support conversion to / from ``pyarrow`` arrays +(and thus support for example serialization to the Parquet file format) +by implementing two methods: ``ExtensionArray.__arrow_array__`` and +``ExtensionDtype.__from_arrow__``. + +The ``ExtensionArray.__arrow_array__`` ensures that ``pyarrow`` knowns how +to convert the specific extension array into a ``pyarrow.Array`` (also when +included as a column in a pandas DataFrame): + +.. code-block:: python + + class MyExtensionArray(ExtensionArray): + ... + + def __arrow_array__(self, type=None): + # convert the underlying array values to a pyarrow Array + import pyarrow + return pyarrow.array(..., type=type) + +The ``ExtensionDtype.__from_arrow__`` method then controls the conversion +back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow +``Array`` or ``ChunkedArray`` as only argument and is expected to return the +appropriate pandas ``ExtensionArray`` for this dtype and the passed values: + +.. code-block:: python + + class ExtensionDtype: + ... + + def __from_arrow__( + self, array: pyarrow.Array/ChunkedArray + ) -> ExtensionArray: + ... + +See more in the `Arrow documentation `__. + +Those methods have been implemented for the nullable integer and string extension +dtypes included in pandas, and ensure roundtrip to pyarrow and the Parquet file format. + .. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py .. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py .. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1d44bae46c4e2..519ddbae9ff87 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4717,7 +4717,9 @@ Several caveats. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. -* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data type (requiring pyarrow >= 1.0.0). +* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data + type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols, + see the :ref:`extension types documentation `). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d841e247e495d..d0669de78a47b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -86,12 +86,13 @@ def construct_array_type(cls): return IntegerArray def __from_arrow__(self, array): - """Construct IntegerArray from passed pyarrow Array""" + """Construct IntegerArray from passed pyarrow Array/ChunkedArray""" import pyarrow if isinstance(array, pyarrow.Array): chunks = [array] else: + # pyarrow.ChunkedArray chunks = array.chunks results = [] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5009da32654eb..8599b5e39f34a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -86,12 +86,13 @@ def __repr__(self) -> str: return "StringDtype" def __from_arrow__(self, array): - """Construct StringArray from passed pyarrow Array""" + """Construct StringArray from passed pyarrow Array/ChunkedArray""" import pyarrow if isinstance(array, pyarrow.Array): chunks = [array] else: + # pyarrow.ChunkedArray chunks = array.chunks results = [] From e593f9ef2803368610995812b614c006ad0cee91 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Nov 2019 09:51:04 +0100 Subject: [PATCH 6/7] fix flake8 --- doc/source/development/extending.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index d4fd4edf729d8..26b4d3d2860c4 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -280,14 +280,12 @@ back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow ``Array`` or ``ChunkedArray`` as only argument and is expected to return the appropriate pandas ``ExtensionArray`` for this dtype and the passed values: -.. code-block:: python +.. code-block:: class ExtensionDtype: ... - def __from_arrow__( - self, array: pyarrow.Array/ChunkedArray - ) -> ExtensionArray: + def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> ExtensionArray: ... See more in the `Arrow documentation `__. From c317aab843fd1cf0ffe907c07d9ad7322ae91bb4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Nov 2019 13:09:30 +0100 Subject: [PATCH 7/7] ignore linting --- doc/source/development/extending.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 26b4d3d2860c4..89d43e8a43825 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -280,7 +280,7 @@ back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow ``Array`` or ``ChunkedArray`` as only argument and is expected to return the appropriate pandas ``ExtensionArray`` for this dtype and the passed values: -.. code-block:: +.. code-block:: none class ExtensionDtype: ...