Commit 45ea153

ENH: Read and write pandas attrs to parquet with pyarrow engine
1 parent 20681c9
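
A minimal sketch of the round-trip behavior this commit adds, mirroring the test added below (pyarrow engine; the file name is a placeholder):

import pandas as pd

df = pd.DataFrame({"a": [1]})
df.attrs = {"name": "my custom dataset"}           # DataFrame-level attrs
df["a"].attrs = {"units": "metre", "nodata": -1}   # per-column attrs

df.to_parquet("data.parquet")                      # attrs serialized into the file's metadata
result = pd.read_parquet("data.parquet")

result.attrs          # -> {'name': 'my custom dataset'}
result["a"].attrs     # -> {'units': 'metre', 'nodata': -1}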

File tree: 3 files changed (+53 -2 lines)

doc/source/whatsnew/v1.3.0.rst (+1)

@@ -229,6 +229,7 @@ Other enhancements
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
 - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
+- Read and write :class:`DataFrame` and :class:`Series` attrs to parquet with pyarrow engine (:issue:`20521`)
 
 .. ---------------------------------------------------------------------------
 
pandas/io/parquet.py (+32 -2)

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import io
+import json
 import os
 from typing import (
     Any,
@@ -154,6 +155,32 @@ def __init__(self):
 
         self.api = pyarrow
 
+    @staticmethod
+    def _write_attrs(table, df: DataFrame):
+        schema_metadata = table.schema.metadata or {}
+        pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
+        column_attrs = {}
+        for col in df.columns:
+            attrs = df[col].attrs
+            if not attrs or not isinstance(col, str):
+                continue
+            column_attrs[col] = attrs
+        pandas_metadata.update(
+            attrs=df.attrs,
+            column_attrs=column_attrs,
+        )
+        schema_metadata[b"pandas"] = json.dumps(pandas_metadata)
+        return table.replace_schema_metadata(schema_metadata)
+
+    @staticmethod
+    def _read_attrs(table, df: DataFrame):
+        schema_metadata = table.schema.metadata or {}
+        pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
+        df.attrs = pandas_metadata.get("attrs", {})
+        col_attrs = pandas_metadata.get("column_attrs", {})
+        for col, attrs in col_attrs.items():
+            df[col].attrs = attrs
+
     def write(
         self,
         df: DataFrame,
@@ -171,6 +198,7 @@ def write(
         from_pandas_kwargs["preserve_index"] = index
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        table = self._write_attrs(table, df)
 
         path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
             path,
@@ -242,9 +270,11 @@ def read(
             mode="rb",
         )
         try:
-            result = self.api.parquet.read_table(
+            table = self.api.parquet.read_table(
                 path_or_handle, columns=columns, **kwargs
-            ).to_pandas(**to_pandas_kwargs)
+            )
+            result = table.to_pandas(**to_pandas_kwargs)
+            self._read_attrs(table, result)
             if manager == "array":
                 result = result._as_manager("array", copy=False)
             return result
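
For context, the attrs piggyback on the JSON blob pandas already stores under the b"pandas" key of the Arrow schema metadata. A sketch of how to inspect it with plain pyarrow, assuming data.parquet was written by a build containing this patch:

import json
import pyarrow.parquet as pq

# Schema metadata is a dict of bytes -> bytes; the pandas entry is JSON.
metadata = pq.read_table("data.parquet").schema.metadata
pandas_meta = json.loads(metadata[b"pandas"])

pandas_meta["attrs"]         # DataFrame-level attrs written by _write_attrs
pandas_meta["column_attrs"]  # per-column attrs, keyed by column name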

pandas/tests/io/test_parquet.py (+20)

@@ -907,6 +907,26 @@ def test_read_parquet_manager(self, pa, using_array_manager):
         else:
             assert isinstance(result._mgr, pd.core.internals.BlockManager)
 
+    @td.skip_if_no("pyarrow")
+    def test_read_write_attrs(self, pa):
+        df = pd.DataFrame({"a": [1]})
+        df.attrs = {"name": "my custom dataset"}
+        df.a.attrs = {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+        with tm.ensure_clean() as path:
+            df.to_parquet(path)
+            result = read_parquet(path)
+
+        assert result.attrs == {"name": "my custom dataset"}
+        assert result["a"].attrs == {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
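
One caveat visible in _write_attrs above: per-column attrs are only persisted for string column labels (JSON object keys are always strings); other labels are silently skipped. A sketch:

df = pd.DataFrame({0: [1, 2]})       # integer column label
df[0].attrs = {"units": "metre"}     # skipped on write: fails the isinstance(col, str) check
df.attrs = {"name": "kept"}          # DataFrame-level attrs are unaffected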
