From 09ba998de1d890cf69bb2ba4bf6e0488ead91148 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Tue, 10 Oct 2023 17:36:44 +0800 Subject: [PATCH 1/3] ENH: read_spss stores the metadata in df.attrs --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/spss.py | 3 ++- pandas/tests/io/test_spss.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index cfcc99bf5bda0..1d62a748a5fbb 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -76,6 +76,7 @@ Other enhancements - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) +- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 58487c6cd721b..db31a07df79e6 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -63,9 +63,10 @@ def read_spss( raise TypeError("usecols must be list-like.") usecols = list(usecols) # pyreadstat requires a list - df, _ = pyreadstat.read_sav( + df, metadata = pyreadstat.read_sav( stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals ) + df.attrs = metadata.__dict__ if dtype_backend is not lib.no_default: df = df.convert_dtypes(dtype_backend=dtype_backend) return df diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py 
index 5d8dce72d69c9..a99d7fbf0adcd 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -116,3 +116,33 @@ def test_invalid_dtype_backend(): ) with pytest.raises(ValueError, match=msg): pd.read_spss("test", dtype_backend="numpy") + + +def test_spss_metadata(datapath): + # GH 54264 + fname = datapath("io", "data", "spss", "labelled-num.sav") + + df = pd.read_spss(fname) + assert df.attrs == { + "column_names": ["VAR00002"], + "column_labels": [None], + "column_names_to_labels": {"VAR00002": None}, + "file_encoding": "UTF-8", + "number_columns": 1, + "number_rows": 1, + "variable_value_labels": {"VAR00002": {1.0: "This is one"}}, + "value_labels": {"labels0": {1.0: "This is one"}}, + "variable_to_label": {"VAR00002": "labels0"}, + "notes": [], + "original_variable_types": {"VAR00002": "F8.0"}, + "readstat_variable_types": {"VAR00002": "double"}, + "table_name": None, + "missing_ranges": {}, + "missing_user_values": {}, + "variable_storage_width": {"VAR00002": 8}, + "variable_display_width": {"VAR00002": 8}, + "variable_alignment": {"VAR00002": "unknown"}, + "variable_measure": {"VAR00002": "unknown"}, + "file_label": None, + "file_format": "sav/zsav", + } From f47394ca074bbc055c48d043599a87af00358954 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Tue, 10 Oct 2023 19:23:31 +0800 Subject: [PATCH 2/3] filter warning --- pandas/tests/io/test_spss.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index a99d7fbf0adcd..a916d7ea568b5 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -118,6 +118,7 @@ def test_invalid_dtype_backend(): pd.read_spss("test", dtype_backend="numpy") +@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") def test_spss_metadata(datapath): # GH 54264 fname = datapath("io", "data", "spss", "labelled-num.sav") From 0c584b187e087f6a77a93046500eaa6a45a4242a Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Wed, 11 Oct 
2023 00:10:57 +0800 Subject: [PATCH 3/3] Make separate variable --- pandas/tests/io/test_spss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index a916d7ea568b5..b612b64e3b020 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -124,7 +124,7 @@ def test_spss_metadata(datapath): fname = datapath("io", "data", "spss", "labelled-num.sav") df = pd.read_spss(fname) - assert df.attrs == { + metadata = { "column_names": ["VAR00002"], "column_labels": [None], "column_names_to_labels": {"VAR00002": None}, @@ -147,3 +147,4 @@ def test_spss_metadata(datapath): "file_label": None, "file_format": "sav/zsav", } + assert df.attrs == metadata