Skip to content

ENH: Include df.attrs metadata in to_csv output #53816

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
11 changes: 11 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3684,6 +3684,7 @@ def to_csv(
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
comment: str | None = ...,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this string and not bool?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was intended to mirror the read_csv parameter for consistency between the two. Setting it will write the comments with that prefix.

) -> str:
...

Expand Down Expand Up @@ -3711,6 +3712,7 @@ def to_csv(
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
comment: str | None = ...,
) -> None:
...

Expand Down Expand Up @@ -3742,6 +3744,7 @@ def to_csv(
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
comment: str | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Expand Down Expand Up @@ -3847,6 +3850,13 @@ def to_csv(

.. versionadded:: 1.2.0

comment : str, default None
If set, the keys and values of df.attrs will be written to the
beginning of the csv file, prefixed by this value, each key/value
pair on a single line. To prevent downstream reading issues, the
field delimiter is removed from the keys and values before writing.
Complement of pd.read_csv's 'comment' param.

Returns
-------
None or str
Expand Down Expand Up @@ -3913,6 +3923,7 @@ def to_csv(
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
comment=comment,
)

# ----------------------------------------------------------------------
Expand Down
11 changes: 11 additions & 0 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def __init__(
doublequote: bool = True,
escapechar: str | None = None,
storage_options: StorageOptions | None = None,
comment: str | None = None,
) -> None:
self.fmt = formatter

Expand All @@ -94,6 +95,7 @@ def __init__(
self.date_format = date_format
self.cols = self._initialize_columns(cols)
self.chunksize = self._initialize_chunksize(chunksize)
self.comment = comment

@property
def na_rep(self) -> str:
Expand Down Expand Up @@ -265,6 +267,8 @@ def save(self) -> None:
self._save()

# Drive the write in a fixed order: optional attrs comment lines first, then
# the header (if required), then the body.
def _save(self) -> None:
# Comment lines are opt-in: they are only emitted when a non-empty comment
# prefix was passed to the formatter.
if self.comment:
self._save_df_attrs()
# The header step is skipped unless self._need_to_save_header is truthy.
if self._need_to_save_header:
self._save_header()
self._save_body()
Expand Down Expand Up @@ -323,3 +327,10 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
self.cols,
self.writer,
)

def _save_df_attrs(self) -> None:
    """
    Write every ``attrs`` entry of the frame as a single comment line.

    Each key/value pair is passed to ``self.writer`` as one field of the
    form ``<comment><key>:<value>``.  Keys and values are converted with
    ``str`` and sanitised first so the written line stays a single physical
    line that begins with the comment prefix:

    * line breaks are collapsed to a single space,
    * the dialect's delimiter is removed,
    * the dialect's quote character is removed (under minimal quoting its
      presence would make the writer wrap the whole field in quotes, hiding
      the comment prefix from readers).
    """
    dialect = self.writer.dialect

    # ``quotechar`` can be None for dialects that disable quoting.
    unwanted = [dialect.delimiter]
    if dialect.quotechar:
        unwanted.append(dialect.quotechar)

    def _sanitize(obj: object) -> str:
        # Collapse line breaks before dropping characters, so a space that is
        # itself the delimiter is still removed afterwards.
        text = " ".join(str(obj).splitlines())
        for char in unwanted:
            text = text.replace(char, "")
        return text

    # NOTE(review): with QUOTE_ALL / QUOTE_NONNUMERIC the writer still quotes
    # this field; handling that would need direct access to the output handle.
    for key, value in self.fmt.frame.attrs.items():
        line = f"{self.comment}{_sanitize(key)}:{_sanitize(value)}"
        self.writer.writerow([line])
2 changes: 2 additions & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,7 @@ def to_csv(
escapechar: str | None = None,
errors: str = "strict",
storage_options: StorageOptions | None = None,
comment: str | None = None,
) -> str | None:
"""
Render dataframe as comma-separated file.
Expand Down Expand Up @@ -1148,6 +1149,7 @@ def to_csv(
escapechar=escapechar,
storage_options=storage_options,
formatter=self.fmt,
comment=comment,
)
csv_formatter.save()

Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/frame/conftest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from io import StringIO

import numpy as np
import pytest

from pandas import (
DataFrame,
NaT,
date_range,
read_csv,
)
import pandas._testing as tm

Expand Down Expand Up @@ -259,3 +262,26 @@ def frame_of_index_cols():
}
)
return df


@pytest.fixture
def comments_attrs():
    """
    Mapping used as ``DataFrame.attrs`` in the comment-writing tests.

    Includes a value and a key that contain a comma so delimiter handling
    is exercised.
    """
    attrs = {
        "one": "Hello",
        "two": "Hello World",
        "three": "Hello, World!",
        "four,": "comma in keym",
    }
    return attrs


@pytest.fixture
def data_for_comments_raw():
    """Comma-separated text: a three-column header followed by three rows."""
    return "col1,col2,col3\n0,0,0\n1,1,1\n2,2,2\n"


@pytest.fixture
def frame_for_comments(data_for_comments_raw, comments_attrs):
    """
    DataFrame parsed from ``data_for_comments_raw`` whose ``attrs`` is set
    to ``comments_attrs``.
    """
    frame = read_csv(StringIO(data_for_comments_raw))
    frame.attrs = comments_attrs
    return frame
62 changes: 62 additions & 0 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,3 +1310,65 @@ def test_to_csv_categorical_and_interval(self):
expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"']
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected

def prepate_string_rep_of_comment_output(
    self, delim: str, comments_attrs, data_for_comments_raw, frame_for_comments
) -> str:
    """
    Build the text expected from ``to_csv`` when attrs are written as comments.

    One ``#key:value`` line is produced per ``comments_attrs`` item, with
    ``delim`` removed from both key and value, followed by
    ``data_for_comments_raw`` with its commas replaced by ``delim``.
    ``frame_for_comments`` is accepted but not used.
    """
    prefix = "#"
    # Comment lines must not contain the delimiter in use.
    header = "".join(
        f"{prefix}{name.replace(delim, '')}:{text.replace(delim, '')}\n"
        for name, text in comments_attrs.items()
    )
    return header + data_for_comments_raw.replace(",", delim)

def test_comment_writer_csv(
    self, comments_attrs, data_for_comments_raw, frame_for_comments
):
    """
    Check comma-separated output with attrs written as ``#`` comment lines.

    Verifies both that the expected text parses back to the frame and that
    ``to_csv(comment="#")`` writes exactly that text.
    """
    comment = "#"
    delim = ","
    output_data = self.prepate_string_rep_of_comment_output(
        delim, comments_attrs, data_for_comments_raw, frame_for_comments
    )
    read_output = read_csv(StringIO(output_data), comment=comment)

    # Check output data can be read correctly.  assert_frame_equal raises its
    # own AssertionError; a trailing ", 'message'" would only build an unused
    # tuple, so none is attached.
    tm.assert_frame_equal(read_output, frame_for_comments)

    # Check saved output is as expected
    with tm.ensure_clean() as path:
        frame_for_comments.to_csv(path, comment=comment, index=False)
        with open(path, encoding="utf-8") as fp:
            written = fp.read()
    assert written == output_data, "csv output with comment lines not as expected"

def test_comment_writer_tabs(
    self, comments_attrs, data_for_comments_raw, frame_for_comments
):
    """
    Check tab-separated output with attrs written as ``#`` comment lines.

    Verifies both that the expected text parses back to the frame and that
    ``to_csv(comment="#", sep="\\t")`` writes exactly that text.
    """
    comment = "#"
    delim = "\t"
    output_data = self.prepate_string_rep_of_comment_output(
        delim, comments_attrs, data_for_comments_raw, frame_for_comments
    )
    read_output = read_csv(StringIO(output_data), comment=comment, sep="\t")

    # assert_frame_equal raises its own AssertionError; a trailing
    # ", 'message'" would only build an unused tuple, so none is attached.
    tm.assert_frame_equal(read_output, frame_for_comments)

    with tm.ensure_clean() as path:
        frame_for_comments.to_csv(path, comment=comment, index=False, sep="\t")
        with open(path, encoding="utf-8") as fp:
            written = fp.read()
    assert written == output_data, "tsv output with comment lines not as expected"