From 4e97e0552f04476a597002861d1cb91880e00be9 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 8 Nov 2019 14:24:29 +0100
Subject: [PATCH 1/7] Add __from_arrow__ support for IntegerArray

---
 pandas/core/arrays/integer.py       | 28 ++++++++++++++++++++++++++++
 pandas/tests/arrays/test_integer.py | 12 ++++++++++++
 pandas/tests/io/test_parquet.py     | 14 ++++++++++----
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 08b53e54b91ef..d841e247e495d 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -85,6 +85,34 @@ def construct_array_type(cls):
         """
         return IntegerArray
 
+    def __from_arrow__(self, array):
+        """Construct IntegerArray from passed pyarrow Array"""
+        import pyarrow
+
+        if isinstance(array, pyarrow.Array):
+            chunks = [array]
+        else:
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            buflist = arr.buffers()
+            data = np.frombuffer(buflist[1], dtype=self.type)[
+                arr.offset : arr.offset + len(arr)
+            ]
+            bitmask = buflist[0]
+            if bitmask is not None:
+                mask = pyarrow.BooleanArray.from_buffers(
+                    pyarrow.bool_(), len(arr), [None, bitmask]
+                )
+                mask = np.asarray(mask)
+            else:
+                mask = np.ones(len(arr), dtype=bool)
+            int_arr = IntegerArray(data.copy(), ~mask, copy=False)
+            results.append(int_arr)
+
+        return IntegerArray._concat_same_type(results)
+
 
 def integer_array(values, dtype=None, copy=False):
     """
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
index 793de66767cc3..13e925f37f41f 100644
--- a/pandas/tests/arrays/test_integer.py
+++ b/pandas/tests/arrays/test_integer.py
@@ -829,6 +829,18 @@ def test_arrow_array(data):
     assert arr.equals(expected)
 
 
+@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+def test_arrow_roundtrip(data):
+    # roundtrip possible from arrow 1.0.0
+    import pyarrow as pa
+
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    assert table.field("a").type == str(data.dtype.numpy_dtype)
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
 # TODO(jreback) - these need testing / are broken
 
 # shift
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index debc797fe6e88..3d5554fe26435 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -514,13 +514,19 @@ def test_additional_extension_arrays(self, pa):
                 "b": pd.Series(["a", None, "c"], dtype="string"),
             }
         )
-        # currently de-serialized as plain int / object
-        expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
+        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
+            expected = df.assign(b=df.b.astype("object"))
+        else:
+            # de-serialized as plain int / object
+            expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
         check_round_trip(df, pa, expected=expected)
 
         df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
-        # if missing values in integer, currently de-serialized as float
-        expected = df.assign(a=df.a.astype("float64"))
+        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
+            expected = df
+        else:
+            # if missing values in integer, currently de-serialized as float
+            expected = df.assign(a=df.a.astype("float64"))
         check_round_trip(df, pa, expected=expected)
 
 

From 27dcbfbaae86798cf94eb6a1cdd95604e9fc2d5b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 8 Nov 2019 14:35:02 +0100
Subject: [PATCH 2/7] Add __from_arrow__ support for StringArray

---
 pandas/core/arrays/string_.py              | 16 ++++++++++++++++
 pandas/tests/arrays/string_/test_string.py | 14 ++++++++++++++
 pandas/tests/io/test_parquet.py            |  2 +-
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 7c487b227de20..c185cc9495005 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -85,6 +85,22 @@ def construct_array_type(cls) -> "Type[StringArray]":
     def __repr__(self) -> str:
         return "StringDtype"
 
+    def __from_arrow__(self, array):
+        """Construct StringArray from passed pyarrow Array"""
+        import pyarrow
+
+        if isinstance(array, pyarrow.Array):
+            chunks = [array]
+        else:
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            str_arr = StringArray(np.array(arr))
+            results.append(str_arr)
+
+        return StringArray._concat_same_type(results)
+
 
 class StringArray(PandasArray):
     """
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index efe2b4e0b2deb..843e09ea891eb 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -171,3 +171,17 @@ def test_arrow_array():
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
     assert arr.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+def test_arrow_roundtrip():
+    # roundtrip possible from arrow 1.0.0
+    import pyarrow as pa
+
+    data = pd.array(["a", "b", "c"], dtype="string")
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    assert table.field("a").type == "string"
+    result = table.to_pandas()
+    assert isinstance(result["a"].dtype, pd.StringDtype)
+    tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 3d5554fe26435..e42904e40768a 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -515,7 +515,7 @@ def test_additional_extension_arrays(self, pa):
             }
         )
         if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
-            expected = df.assign(b=df.b.astype("object"))
+            expected = df
         else:
             # de-serialized as plain int / object
             expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))

From 72f41428ac7f0b529968b7f67805dbc696c8f93c Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 8 Nov 2019 14:41:05 +0100
Subject: [PATCH 3/7] add whatsnew / docs

---
 doc/source/user_guide/io.rst   | 1 +
 doc/source/whatsnew/v1.0.0.rst | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index ef87b6c57b1b9..756ddda984087 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4717,6 +4717,7 @@ Several caveats.
 * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
+* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data type (requiring pyarrow >= 1.0.0).
 
 You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
 If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index cb1d80a34514c..4fc5be83edd94 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -113,6 +113,9 @@ Other enhancements
 - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
 - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
 - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
+- Roundtripping DataFrames with nullable integer or string data types to parquet
+  (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
+  now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
 
 Build Changes
 ^^^^^^^^^^^^^

From 38218adfde25d91687d4ca24619eedb33802b5ee Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 11 Nov 2019 17:19:42 +0100
Subject: [PATCH 4/7] fix None -> np.nan conversion

---
 pandas/core/arrays/string_.py              | 3 ++-
 pandas/tests/arrays/string_/test_string.py | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index c185cc9495005..5009da32654eb 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -96,7 +96,8 @@ def __from_arrow__(self, array):
 
         results = []
         for arr in chunks:
-            str_arr = StringArray(np.array(arr))
+            # using _from_sequence to ensure None is convered to np.nan
+            str_arr = StringArray._from_sequence(np.array(arr))
             results.append(str_arr)
 
         return StringArray._concat_same_type(results)
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 843e09ea891eb..1ce62d8f8b3d9 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -178,10 +178,12 @@ def test_arrow_roundtrip():
     # roundtrip possible from arrow 1.0.0
     import pyarrow as pa
 
-    data = pd.array(["a", "b", "c"], dtype="string")
+    data = pd.array(["a", "b", None], dtype="string")
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
     assert table.field("a").type == "string"
     result = table.to_pandas()
     assert isinstance(result["a"].dtype, pd.StringDtype)
     tm.assert_frame_equal(result, df)
+    # ensure the missing value is represented by NaN and not None
+    assert np.isnan(result.loc[2, "a"])

From 0521875b53d7e0d782aa5c9175fa996a0d76b08f Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 11 Nov 2019 17:39:56 +0100
Subject: [PATCH 5/7] better documentation + add comment

---
 doc/source/development/extending.rst | 44 ++++++++++++++++++++++++++++
 doc/source/user_guide/io.rst         |  4 ++-
 pandas/core/arrays/integer.py        |  3 +-
 pandas/core/arrays/string_.py        |  3 +-
 4 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index e341dcb8318bc..d4fd4edf729d8 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -251,6 +251,50 @@ To use a test, subclass it:
 See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py
 for a list of all the tests available.
 
+.. _extending.extension.arrow:
+
+Compatibility with Apache Arrow
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An ``ExtensionArray`` can support conversion to / from ``pyarrow`` arrays
+(and thus support for example serialization to the Parquet file format)
+by implementing two methods: ``ExtensionArray.__arrow_array__`` and
+``ExtensionDtype.__from_arrow__``.
+
+The ``ExtensionArray.__arrow_array__`` ensures that ``pyarrow`` knowns how
+to convert the specific extension array into a ``pyarrow.Array`` (also when
+included as a column in a pandas DataFrame):
+
+.. code-block:: python
+
+    class MyExtensionArray(ExtensionArray):
+        ...
+
+        def __arrow_array__(self, type=None):
+            # convert the underlying array values to a pyarrow Array
+            import pyarrow
+            return pyarrow.array(..., type=type)
+
+The ``ExtensionDtype.__from_arrow__`` method then controls the conversion
+back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow
+``Array`` or ``ChunkedArray`` as only argument and is expected to return the
+appropriate pandas ``ExtensionArray`` for this dtype and the passed values:
+
+.. code-block:: python
+
+    class ExtensionDtype:
+        ...
+
+        def __from_arrow__(
+            self, array: pyarrow.Array/ChunkedArray
+        ) -> ExtensionArray:
+            ...
+
+See more in the `Arrow documentation <https://arrow.apache.org/docs/python/extending_types.html>`__.
+
+Those methods have been implemented for the nullable integer and string extension
+dtypes included in pandas, and ensure roundtrip to pyarrow and the Parquet file format.
+
 .. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py
 .. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
 .. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 1d44bae46c4e2..519ddbae9ff87 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4717,7 +4717,9 @@ Several caveats.
 * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
-* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data type (requiring pyarrow >= 1.0.0).
+* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data
+  type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols,
+  see the :ref:`extension types documentation <extending.extension.arrow>`).
 
 You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
 If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index d841e247e495d..d0669de78a47b 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -86,12 +86,13 @@ def construct_array_type(cls):
         return IntegerArray
 
     def __from_arrow__(self, array):
-        """Construct IntegerArray from passed pyarrow Array"""
+        """Construct IntegerArray from passed pyarrow Array/ChunkedArray"""
         import pyarrow
 
         if isinstance(array, pyarrow.Array):
             chunks = [array]
         else:
+            # pyarrow.ChunkedArray
             chunks = array.chunks
 
         results = []
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 5009da32654eb..8599b5e39f34a 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -86,12 +86,13 @@ def __repr__(self) -> str:
         return "StringDtype"
 
     def __from_arrow__(self, array):
-        """Construct StringArray from passed pyarrow Array"""
+        """Construct StringArray from passed pyarrow Array/ChunkedArray"""
         import pyarrow
 
         if isinstance(array, pyarrow.Array):
             chunks = [array]
         else:
+            # pyarrow.ChunkedArray
             chunks = array.chunks
 
         results = []

From e593f9ef2803368610995812b614c006ad0cee91 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 12 Nov 2019 09:51:04 +0100
Subject: [PATCH 6/7] fix flake8

---
 doc/source/development/extending.rst | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index d4fd4edf729d8..26b4d3d2860c4 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -280,14 +280,12 @@ back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow
 ``Array`` or ``ChunkedArray`` as only argument and is expected to return the
 appropriate pandas ``ExtensionArray`` for this dtype and the passed values:
 
-.. code-block:: python
+.. code-block::
 
     class ExtensionDtype:
         ...
 
-        def __from_arrow__(
-            self, array: pyarrow.Array/ChunkedArray
-        ) -> ExtensionArray:
+        def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> ExtensionArray:
             ...
 
 See more in the `Arrow documentation <https://arrow.apache.org/docs/python/extending_types.html>`__.

From c317aab843fd1cf0ffe907c07d9ad7322ae91bb4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 12 Nov 2019 13:09:30 +0100
Subject: [PATCH 7/7] ignore linting

---
 doc/source/development/extending.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 26b4d3d2860c4..89d43e8a43825 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -280,7 +280,7 @@ back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow
 ``Array`` or ``ChunkedArray`` as only argument and is expected to return the
 appropriate pandas ``ExtensionArray`` for this dtype and the passed values:
 
-.. code-block::
+.. code-block:: none
 
     class ExtensionDtype:
         ...