
Commit b311a62

Author: Alan D. Snow
ENH: Read and write pandas attrs to parquet with pyarrow engine
1 parent 09f3bf8

File tree: 3 files changed, +52 -3 lines

doc/source/whatsnew/v1.3.0.rst (+1 -1)

@@ -228,7 +228,7 @@ Other enhancements
 - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
 - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
-
+- Read and write :class:`DataFrame` and :class:`Series` attrs to parquet with pyarrow engine (:issue:`20521`)

 .. ---------------------------------------------------------------------------
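For context, a minimal sketch of the round trip this entry enables, assuming a pandas build that includes this commit (data.parquet is an illustrative file name; attrs is an experimental pandas API):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.attrs = {"name": "my custom dataset"}          # frame-level metadata
df["a"].attrs = {"units": "metre", "nodata": -1}  # column-level metadata

df.to_parquet("data.parquet", engine="pyarrow")
result = pd.read_parquet("data.parquet", engine="pyarrow")

assert result.attrs == {"name": "my custom dataset"}
assert result["a"].attrs == {"units": "metre", "nodata": -1}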

pandas/io/parquet.py (+31 -2)

@@ -2,6 +2,7 @@
 from __future__ import annotations

 import io
+import json
 import os
 from typing import (
     Any,
@@ -154,6 +155,31 @@ def __init__(self):

         self.api = pyarrow

+    @staticmethod
+    def _write_attrs(table, df: DataFrame):
+        schema_metadata = dict(table.schema.metadata or {})
+        pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
+        column_attrs = {}
+        for col in df.columns:
+            attrs = df[col].attrs
+            if not attrs or not isinstance(col, str):
+                continue
+            column_attrs[col] = attrs
+        pandas_metadata.update(
+            attrs=df.attrs,
+            column_attrs=column_attrs,
+        )
+        schema_metadata[b"pandas"] = json.dumps(pandas_metadata)
+        return table.replace_schema_metadata(schema_metadata)
+
+    @staticmethod
+    def _read_attrs(table, df: DataFrame):
+        pandas_metadata = json.loads(table.schema.metadata[b"pandas"])
+        df.attrs = pandas_metadata.get("attrs", {})
+        col_attrs = pandas_metadata.get("column_attrs", {})
+        for col, attrs in col_attrs.items():
+            df[col].attrs = attrs
+
     def write(
         self,
         df: DataFrame,
@@ -171,6 +197,7 @@ def write(
         from_pandas_kwargs["preserve_index"] = index

         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        table = self._write_attrs(table, df)

         path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
             path,
@@ -242,9 +269,11 @@ def read(
                 mode="rb",
             )
         try:
-            result = self.api.parquet.read_table(
+            table = self.api.parquet.read_table(
                 path_or_handle, columns=columns, **kwargs
-            ).to_pandas(**to_pandas_kwargs)
+            )
+            result = table.to_pandas(**to_pandas_kwargs)
+            self._read_attrs(table, result)
             if manager == "array":
                 result = result._as_manager("array", copy=False)
             return result
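
A note on the mechanism: rather than inventing a new metadata key, _write_attrs piggybacks on the JSON document that pyarrow already stores under the b"pandas" key of the Arrow schema metadata, adding attrs and column_attrs entries, and _read_attrs pulls them back out after to_pandas. A rough standalone sketch of that round trip using only pyarrow (the demo values are illustrative):

import json

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [1]})
df.attrs = {"name": "demo"}  # illustrative frame-level attrs

# pyarrow writes its own pandas metadata when converting a frame ...
table = pa.Table.from_pandas(df)
metadata = dict(table.schema.metadata or {})
pandas_meta = json.loads(metadata[b"pandas"])

# ... and _write_attrs merges the attrs into that same JSON document
pandas_meta.update(attrs=df.attrs, column_attrs={})
metadata[b"pandas"] = json.dumps(pandas_meta)
table = table.replace_schema_metadata(metadata)

# _read_attrs later recovers them from the schema of the table read back
print(json.loads(table.schema.metadata[b"pandas"])["attrs"])  # {'name': 'demo'}

Because the payload lives in ordinary schema metadata, parquet readers that don't know about it simply ignore it; the column data itself is unchanged.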

pandas/tests/io/test_parquet.py (+20)

@@ -907,6 +907,26 @@ def test_read_parquet_manager(self, pa, using_array_manager):
         else:
             assert isinstance(result._mgr, pd.core.internals.BlockManager)

+    @td.skip_if_no("pyarrow")
+    def test_read_write_attrs(self, pa):
+        pdf = pd.DataFrame({"a": [1]})
+        pdf.attrs = {"name": "my custom dataset"}
+        pdf.a.attrs = {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+        with tm.ensure_clean() as path:
+            pdf.to_parquet(path)
+            result = read_parquet(path)
+
+        assert result.attrs == {"name": "my custom dataset"}
+        assert result["a"].attrs == {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+

 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
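
One caveat visible in _write_attrs above: column attrs are persisted only for string column labels, because JSON object keys must be strings. A hypothetical illustration:

import pandas as pd

df = pd.DataFrame({0: [1], "b": [2]})
df[0].attrs = {"note": "skipped on write"}  # non-string label: not persisted
df["b"].attrs = {"note": "kept"}            # string label: round-trips

Empty attrs dicts are likewise skipped, so frames that never set attrs write the same per-column metadata as before.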
