Commit 45ea153

ENH: Read and write pandas attrs to parquet with pyarrow engine
1 parent 20681c9
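
A minimal sketch of the round-trip behavior this commit adds, mirroring the test added below (pyarrow engine; the file name is a placeholder):

import pandas as pd

df = pd.DataFrame({"a": [1]})
df.attrs = {"name": "my custom dataset"}           # DataFrame-level attrs
df["a"].attrs = {"units": "metre", "nodata": -1}   # per-column attrs

df.to_parquet("data.parquet")                      # attrs serialized into the file's metadata
result = pd.read_parquet("data.parquet")

result.attrs          # -> {'name': 'my custom dataset'}
result["a"].attrs     # -> {'units': 'metre', 'nodata': -1}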

File tree: 3 files changed (+53 -2 lines)

doc/source/whatsnew/v1.3.0.rst (+1)

@@ -229,6 +229,7 @@ Other enhancements
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
 - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
+- Read and write :class:`DataFrame` and :class:`Series` attrs to parquet with pyarrow engine (:issue:`20521`)
 
 .. ---------------------------------------------------------------------------
 
pandas/io/parquet.py (+32 -2)

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import io
+import json
 import os
 from typing import (
     Any,
@@ -154,6 +155,32 @@ def __init__(self):
 
         self.api = pyarrow
 
+    @staticmethod
+    def _write_attrs(table, df: DataFrame):
+        schema_metadata = table.schema.metadata or {}
+        pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
+        column_attrs = {}
+        for col in df.columns:
+            attrs = df[col].attrs
+            if not attrs or not isinstance(col, str):
+                continue
+            column_attrs[col] = attrs
+        pandas_metadata.update(
+            attrs=df.attrs,
+            column_attrs=column_attrs,
+        )
+        schema_metadata[b"pandas"] = json.dumps(pandas_metadata)
+        return table.replace_schema_metadata(schema_metadata)
+
+    @staticmethod
+    def _read_attrs(table, df: DataFrame):
+        schema_metadata = table.schema.metadata or {}
+        pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
+        df.attrs = pandas_metadata.get("attrs", {})
+        col_attrs = pandas_metadata.get("column_attrs", {})
+        for col, attrs in col_attrs.items():
+            df[col].attrs = attrs
+
     def write(
         self,
         df: DataFrame,
@@ -171,6 +198,7 @@ def write(
         from_pandas_kwargs["preserve_index"] = index
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        table = self._write_attrs(table, df)
 
         path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
             path,
@@ -242,9 +270,11 @@ def read(
             mode="rb",
         )
         try:
-            result = self.api.parquet.read_table(
+            table = self.api.parquet.read_table(
                 path_or_handle, columns=columns, **kwargs
-            ).to_pandas(**to_pandas_kwargs)
+            )
+            result = table.to_pandas(**to_pandas_kwargs)
+            self._read_attrs(table, result)
             if manager == "array":
                 result = result._as_manager("array", copy=False)
             return result
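
For context, the attrs piggyback on the JSON blob pandas already stores under the b"pandas" key of the Arrow schema metadata. A sketch of how to inspect it with plain pyarrow, assuming data.parquet was written by a build containing this patch:

import json
import pyarrow.parquet as pq

# Schema metadata is a dict of bytes -> bytes; the pandas entry is JSON.
metadata = pq.read_table("data.parquet").schema.metadata
pandas_meta = json.loads(metadata[b"pandas"])

pandas_meta["attrs"]         # DataFrame-level attrs written by _write_attrs
pandas_meta["column_attrs"]  # per-column attrs, keyed by column name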

pandas/tests/io/test_parquet.py (+20)

@@ -907,6 +907,26 @@ def test_read_parquet_manager(self, pa, using_array_manager):
         else:
             assert isinstance(result._mgr, pd.core.internals.BlockManager)
 
+    @td.skip_if_no("pyarrow")
+    def test_read_write_attrs(self, pa):
+        df = pd.DataFrame({"a": [1]})
+        df.attrs = {"name": "my custom dataset"}
+        df.a.attrs = {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+        with tm.ensure_clean() as path:
+            df.to_parquet(path)
+            result = read_parquet(path)
+
+        assert result.attrs == {"name": "my custom dataset"}
+        assert result["a"].attrs == {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
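
One caveat visible in _write_attrs above: per-column attrs are only persisted for string column labels (JSON object keys are always strings); other labels are silently skipped. A sketch:

df = pd.DataFrame({0: [1, 2]})       # integer column label
df[0].attrs = {"units": "metre"}     # skipped on write: fails the isinstance(col, str) check
df.attrs = {"name": "kept"}          # DataFrame-level attrs are unaffected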
