From 0a069be87306bb746f1b7ff66edc1b7ed366a1d4 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Mon, 11 May 2020 23:39:48 +0100
Subject: [PATCH 1/4] BUG/ENH: Correct categorical on iterators

Return categoricals with the same categories if possible when reading data
through an iterator. Warn if not possible.

closes #31544
---
 doc/source/whatsnew/v1.1.0.rst                |  4 ++
 pandas/io/stata.py                            | 53 +++++++++++++++---
 .../stata/stata-dta-partially-labeled.dta     | Bin 0 -> 1390 bytes
 pandas/tests/io/test_stata.py                 | 32 +++++++++++
 4 files changed, 81 insertions(+), 8 deletions(-)
 create mode 100644 pandas/tests/io/data/stata/stata-dta-partially-labeled.dta

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 6e8cbc34be062..6857c3e41edd1 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -905,6 +905,10 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
+- Bug in :meth:`read_parquet` was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`)
+- Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`)
+- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`)
+
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index fe8dcf1bdb9aa..c521edcf9a55f 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -497,6 +497,21 @@ class InvalidColumnName(Warning):
     """
 
 
+class CategoricalConversionWarning(Warning):
+    pass
+
+
+categorical_conversion_warning = """
+One or more series with value labels are not fully labeled. Reading this
+dataset with an iterator results in categorical variables with different
+categories. This occurs since it is not possible to know all possible values
+until the entire dataset has been read. To avoid this warning, you can either
+read the dataset without an iterator, or manually convert the categorical
+data by setting ``convert_categoricals`` to False and then accessing the
+variable labels through the ``value_labels`` method of the reader.
+"""
+
+
 def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     """
     Checks the dtypes of the columns of a pandas DataFrame for
@@ -1753,8 +1768,8 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
 
         return data[columns]
 
-    @staticmethod
     def _do_convert_categoricals(
+        self,
         data: DataFrame,
         value_label_dict: Dict[str, Dict[Union[float, int], str]],
         lbllist: Sequence[str],
@@ -1768,14 +1783,36 @@ def _do_convert_categoricals(
         for col, label in zip(data, lbllist):
             if label in value_labels:
                 # Explicit call with ordered=True
-                cat_data = Categorical(data[col], ordered=order_categoricals)
-                categories = []
-                for category in cat_data.categories:
-                    if category in value_label_dict[label]:
-                        categories.append(value_label_dict[label][category])
-                    else:
-                        categories.append(category)  # Partially labeled
+                vl = value_label_dict[label]
+                keys = np.array([k for k in vl.keys()])
+                column = data[col]
+                if column.isin(keys).all() and self._chunksize:
+                    # If all categories are in the keys and we are iterating,
+                    # use the same keys for all chunks. If some are missing
+                    # value labels, then we will fall back to the categories
+                    # varying across chunks.
+                    initial_categories = keys
+                    warnings.warn(
+                        categorical_conversion_warning, CategoricalConversionWarning
+                    )
+                else:
+                    initial_categories = None
+                cat_data = Categorical(
+                    column, categories=initial_categories, ordered=order_categoricals
+                )
+                if initial_categories is None:
+                    # If None here, then we need to match the cats in the Categorical
+                    categories = []
+                    for category in cat_data.categories:
+                        if category in vl:
+                            categories.append(vl[category])
+                        else:
+                            categories.append(category)
+                else:
+                    # If all cats are matched, we can use the values
+                    categories = [v for v in vl.values()]
                 try:
+                    # Try to catch duplicate categories
                     cat_data.categories = categories
                 except ValueError as err:
                     vc = Series(categories).value_counts()
diff --git a/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta b/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta
new file mode 100644
index 0000000000000000000000000000000000000000..b9abdb8827432d59ab781a079f96cd466570d483
GIT binary patch
literal 1390
zcmd5+y-yTD6yGx#E(Qw>%_SCB2+?4ey>0l&%XkTBb3j8wcg)@ma>*=rnHfl~($3D(
z(u%^uKf*u2ih|A|9=A|LW2K9Byf9@Y7_81qnl3{M*0V?UIy-iUQ;H~s<~DfAA67CkGqTe?BdaA3DI
zefP+?*}s3~^0!NDw}Ax{7^{_!M7*(HPOEaopSsBKo8e%dAX93bE(X;lp#OBmpBk=S
k1P!K+{&b<_?;jP?`Lox16iMzMI>*#cXtIZ~CwgM90h9mgy8r+H

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 698b5417b471b..9dce2fd9f15ed 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -20,6 +20,7 @@
 from pandas.io.parsers import read_csv
 
 from pandas.io.stata import (
+    CategoricalConversionWarning,
     InvalidColumnName,
     PossiblePrecisionLoss,
     StataMissingValue,
@@ -1923,3 +1924,34 @@ def test_compression_dict(method, file_ext):
             fp = path
         reread = read_stata(fp, index_col="index")
         tm.assert_frame_equal(reread, df)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_chunked_categorical(version):
+    df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
+    df.index.name = "index"
+    with tm.ensure_clean() as path:
+        df.to_stata(path, version=version)
+        reader = StataReader(path, chunksize=2, order_categoricals=False)
+        for i, block in enumerate(reader):
+            block = block.set_index("index")
+            assert "cats" in block
+            tm.assert_series_equal(block.cats, df.cats.iloc[2 * i : 2 * (i + 1)])
+
+
+def test_chunked_categorical_partial(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    reader = StataReader(dta_file, chunksize=2)
+    values = ["a", "b", "a", "b", 3.0]
+    with pytest.warns(CategoricalConversionWarning, match="One or more series"):
+        for i, block in enumerate(reader):
+            assert list(block.cats) == values[2 * i : 2 * (i + 1)]
+            if i < 2:
+                idx = pd.Index(["a", "b"])
+            else:
+                idx = pd.Float64Index([3.0])
+            tm.assert_index_equal(block.cats.cat.categories, idx)
+    reader = StataReader(dta_file, chunksize=5)
+    large_chunk = reader.__next__()
+    direct = read_stata(dta_file)
+    tm.assert_frame_equal(direct, large_chunk)

From 5bc23129a65d15e26b29d24dc67ca63044484f96 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Tue, 12 May 2020 10:18:34 +0100
Subject: [PATCH 2/4] MAINT: Restrict use of iterator

Restrict iterator to StataReaders constructed with a positive chunksize
---
 pandas/io/stata.py            | 10 +++++++++-
 pandas/tests/io/test_stata.py | 12 ++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index c521edcf9a55f..59c4aeeeec325 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1038,6 +1038,10 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
+        if self._chunksize is not None and (
+            not isinstance(chunksize, int) or chunksize <= 0
+        ):
+            raise ValueError("chunksize must be a positive integer when set.")
 
         # State variables for the file
         self._has_string_data = False
@@ -1503,6 +1507,10 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va
 
     def __next__(self) -> DataFrame:
+        if self._chunksize is None:
+            raise ValueError(
+                "chunksize must be set to a positive integer to use as an iterator."
+            )
         return self.read(nrows=self._chunksize or 1)
 
     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
@@ -1786,7 +1794,7 @@ def _do_convert_categoricals(
                 vl = value_label_dict[label]
                 keys = np.array([k for k in vl.keys()])
                 column = data[col]
-                if column.isin(keys).all() and self._chunksize:
+                if self._chunksize is not None and column.isin(keys).all():
                     # If all categories are in the keys and we are iterating,
                     # use the same keys for all chunks. If some are missing
                     # value labels, then we will fall back to the categories
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 9dce2fd9f15ed..33f616cfc768f 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1955,3 +1955,15 @@ def test_chunked_categorical_partial(dirpath):
     large_chunk = reader.__next__()
     direct = read_stata(dta_file)
     tm.assert_frame_equal(direct, large_chunk)
+
+
+def test_iterator_errors(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=-1)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=0)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize="apple")
+    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
+        StataReader(dta_file).__next__()

From 85d0d892525865325a83b941e33ce593001f3533 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Tue, 12 May 2020 12:56:57 +0100
Subject: [PATCH 3/4] CLN: Remove unwanted patterns

---
 pandas/io/stata.py            | 17 ++++++++++-------
 pandas/tests/io/test_stata.py | 26 ++++++++++++++------------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 59c4aeeeec325..ef82e8633610b 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1792,18 +1792,21 @@ def _do_convert_categoricals(
             if label in value_labels:
                 # Explicit call with ordered=True
                 vl = value_label_dict[label]
-                keys = np.array([k for k in vl.keys()])
+                keys = np.array(list(vl.keys()))
                 column = data[col]
-                if self._chunksize is not None and column.isin(keys).all():
+                key_matches = column.isin(keys)
+                if self._chunksize is not None and key_matches.all():
+                    initial_categories = keys
                     # If all categories are in the keys and we are iterating,
                     # use the same keys for all chunks. If some are missing
                     # value labels, then we will fall back to the categories
                     # varying across chunks.
-                    initial_categories = keys
-                    warnings.warn(
-                        categorical_conversion_warning, CategoricalConversionWarning
-                    )
                 else:
+                    if self._chunksize is not None:
+                        # warn if using an iterator
+                        warnings.warn(
+                            categorical_conversion_warning, CategoricalConversionWarning
+                        )
                     initial_categories = None
                 cat_data = Categorical(
                     column, categories=initial_categories, ordered=order_categoricals
                 )
@@ -1818,7 +1821,7 @@ def _do_convert_categoricals(
                         categories.append(category)
                 else:
                     # If all cats are matched, we can use the values
-                    categories = [v for v in vl.values()]
+                    categories = list(vl.values())
                 try:
                     # Try to catch duplicate categories
                     cat_data.categories = categories
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 33f616cfc768f..87441f238ac51 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1941,18 +1941,19 @@ def test_chunked_categorical(version):
 
 def test_chunked_categorical_partial(dirpath):
     dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
-    reader = StataReader(dta_file, chunksize=2)
     values = ["a", "b", "a", "b", 3.0]
-    with pytest.warns(CategoricalConversionWarning, match="One or more series"):
-        for i, block in enumerate(reader):
-            assert list(block.cats) == values[2 * i : 2 * (i + 1)]
-            if i < 2:
-                idx = pd.Index(["a", "b"])
-            else:
-                idx = pd.Float64Index([3.0])
-            tm.assert_index_equal(block.cats.cat.categories, idx)
-    reader = StataReader(dta_file, chunksize=5)
-    large_chunk = reader.__next__()
+    with StataReader(dta_file, chunksize=2) as reader:
+        with tm.assert_produces_warning(CategoricalConversionWarning):
+            for i, block in enumerate(reader):
+                assert list(block.cats) == values[2 * i : 2 * (i + 1)]
+                if i < 2:
+                    idx = pd.Index(["a", "b"])
+                else:
+                    idx = pd.Float64Index([3.0])
+                tm.assert_index_equal(block.cats.cat.categories, idx)
+    with tm.assert_produces_warning(CategoricalConversionWarning):
+        with StataReader(dta_file, chunksize=5) as reader:
+            large_chunk = reader.__next__()
     direct = read_stata(dta_file)
     tm.assert_frame_equal(direct, large_chunk)
 
@@ -1966,4 +1967,5 @@ def test_iterator_errors(dirpath):
     with pytest.raises(ValueError, match="chunksize must be a positive"):
         StataReader(dta_file, chunksize="apple")
     with pytest.raises(ValueError, match="chunksize must be set to a positive"):
-        StataReader(dta_file).__next__()
+        with StataReader(dta_file) as reader:
+            reader.__next__()

From fa173aa4d58bd2d677cfc3f497bf9ef011279aa7 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Tue, 12 May 2020 17:58:01 +0100
Subject: [PATCH 4/4] TST: Add a test to check order invariance

Check that the label ordering does not cause any issues
---
 doc/source/whatsnew/v1.1.0.rst |  3 ---
 pandas/io/stata.py             | 12 ++++++++++++
 pandas/tests/io/test_stata.py  | 14 ++++++++++++++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 6857c3e41edd1..d11c3699dc86d 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -905,11 +905,8 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
-- Bug in :meth:`read_parquet` was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`)
-- Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`)
 - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`)
-
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index ef82e8633610b..e9adf5292ef6f 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -106,6 +106,14 @@
 iterator : bool, default False
     Return StataReader object."""
 
+_reader_notes = """\
+Notes
+-----
+Categorical variables read through an iterator may not have the same
+categories and dtype. This occurs when a variable stored in a DTA
+file is associated with an incomplete set of value labels that only
+label a strict subset of the values."""
+
 _read_stata_doc = f"""
 Read Stata file into DataFrame.
 
@@ -135,6 +143,8 @@
 io.stata.StataReader : Low-level reader for Stata data files.
 DataFrame.to_stata: Export Stata data files.
 
+{_reader_notes}
+
 Examples
 --------
 Read a Stata dta file:
@@ -176,6 +186,8 @@
 {_statafile_processing_params1}
 {_statafile_processing_params2}
 {_chunksize_params}
+
+{_reader_notes}
 """
 
 
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 87441f238ac51..aa3aa61bbb984 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1969,3 +1969,17 @@ def test_iterator_errors(dirpath):
     with pytest.raises(ValueError, match="chunksize must be set to a positive"):
         with StataReader(dta_file) as reader:
             reader.__next__()
+
+
+def test_iterator_value_labels():
+    # GH 31544
+    values = ["c_label", "b_label"] + ["a_label"] * 500
+    df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
+    with tm.ensure_clean() as path:
+        df.to_stata(path, write_index=False)
+        reader = pd.read_stata(path, chunksize=100)
+        expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
+        for j, chunk in enumerate(reader):
+            for i in range(2):
+                tm.assert_index_equal(chunk.dtypes[i].categories, expected)
+            tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
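
Taken together, the four patches mean that chunked Stata reads keep a stable categorical dtype whenever the value labels cover every observed value, and emit a warning when they cannot. A minimal usage sketch, assuming a pandas build that includes these patches; the ``.dta`` file names and the ``col0`` column below are placeholders, not files or data shipped with pandas:

>>> import pandas as pd
>>> # "labels.dta" stands in for a fully value-labeled Stata file
>>> reader = pd.read_stata("labels.dta", chunksize=100)
>>> for chunk in reader:
...     print(chunk["col0"].cat.categories)  # the same Index in every chunk
>>> # A partially labeled file read through an iterator now emits
>>> # pandas.io.stata.CategoricalConversionWarning instead of silently
>>> # returning chunks whose categorical dtypes disagree.
>>> next(pd.read_stata("partially-labeled.dta", chunksize=2))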