From 0a069be87306bb746f1b7ff66edc1b7ed366a1d4 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Mon, 11 May 2020 23:39:48 +0100
Subject: [PATCH 1/4] BUG/ENH: Correct categorical on iterators

Return categoricals with the same categories if possible when reading data
through an iterator. Warn if not possible.

closes #31544
---
 doc/source/whatsnew/v1.1.0.rst                |  4 ++
 pandas/io/stata.py                            | 53 +++++++++++++++---
 .../stata/stata-dta-partially-labeled.dta     | Bin 0 -> 1390 bytes
 pandas/tests/io/test_stata.py                 | 32 +++++++++++
 4 files changed, 81 insertions(+), 8 deletions(-)
 create mode 100644 pandas/tests/io/data/stata/stata-dta-partially-labeled.dta

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 6e8cbc34be062..6857c3e41edd1 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -905,6 +905,10 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
+- Bug in :meth:`read_parquet` was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`)
+- Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`)
+- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`)
+
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index fe8dcf1bdb9aa..c521edcf9a55f 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -497,6 +497,21 @@ class InvalidColumnName(Warning):
     """
 
 
+class CategoricalConversionWarning(Warning):
+    pass
+
+
+categorical_conversion_warning = """
+One or more series with value labels are not fully labeled. Reading this
+dataset with an iterator results in categorical variables with different
+categories. This occurs since it is not possible to know all possible values
+until the entire dataset has been read. To avoid this warning, you can either
+read the dataset without an iterator, or manually convert the categorical
+data by setting ``convert_categoricals`` to False and then accessing the
+variable labels through the ``value_labels`` method of the reader.
+"""
+
+
 def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     """
     Checks the dtypes of the columns of a pandas DataFrame for
@@ -1753,8 +1768,8 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
 
         return data[columns]
 
-    @staticmethod
     def _do_convert_categoricals(
+        self,
         data: DataFrame,
         value_label_dict: Dict[str, Dict[Union[float, int], str]],
         lbllist: Sequence[str],
@@ -1768,14 +1783,36 @@ def _do_convert_categoricals(
         for col, label in zip(data, lbllist):
             if label in value_labels:
                 # Explicit call with ordered=True
-                cat_data = Categorical(data[col], ordered=order_categoricals)
-                categories = []
-                for category in cat_data.categories:
-                    if category in value_label_dict[label]:
-                        categories.append(value_label_dict[label][category])
-                    else:
-                        categories.append(category)  # Partially labeled
+                vl = value_label_dict[label]
+                keys = np.array([k for k in vl.keys()])
+                column = data[col]
+                if column.isin(keys).all() and self._chunksize:
+                    # If all categories are in the keys and we are iterating,
+                    # use the same keys for all chunks. If some are missing
+                    # value labels, then we will fall back to the categories
+                    # varying across chunks.
+                    initial_categories = keys
+                    warnings.warn(
+                        categorical_conversion_warning, CategoricalConversionWarning
+                    )
+                else:
+                    initial_categories = None
+                cat_data = Categorical(
+                    column, categories=initial_categories, ordered=order_categoricals
+                )
+                if initial_categories is None:
+                    # If None here, then we need to match the cats in the Categorical
+                    categories = []
+                    for category in cat_data.categories:
+                        if category in vl:
+                            categories.append(vl[category])
+                        else:
+                            categories.append(category)
+                else:
+                    # If all cats are matched, we can use the values
+                    categories = [v for v in vl.values()]
                 try:
+                    # Try to catch duplicate categories
                     cat_data.categories = categories
                 except ValueError as err:
                     vc = Series(categories).value_counts()
diff --git a/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta b/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta
new file mode 100644
index 0000000000000000000000000000000000000000..b9abdb8827432d59ab781a079f96cd466570d483
GIT binary patch
literal 1390
zcmd5+y-yTD6yGx#E(Qw>%_SCB2+?4ey>0l&%XkTBb3j8wcg)@ma>*=rnHfl~($3D(
z(u%^uKf*u2ih|A|9=A|LW2K9Byf9@Y7_81qnl3{M*0V?UIy-iUQ;H~s<~DfAA67CkGqTe?BdaA3DI
zefP+?*}s3~^0!NDw}Ax{7^{_!M7*(HPOEaopSsBKo8e%dAX93bE(X;lp#OBmpBk=S
k1P!K+{&b<_?;jP?`Lox16iMzMI>*#cXtIZ~CwgM90h9mgy8r+H

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 698b5417b471b..9dce2fd9f15ed 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -20,6 +20,7 @@
 from pandas.io.parsers import read_csv
 
 from pandas.io.stata import (
+    CategoricalConversionWarning,
     InvalidColumnName,
     PossiblePrecisionLoss,
     StataMissingValue,
@@ -1923,3 +1924,34 @@ def test_compression_dict(method, file_ext):
             fp = path
         reread = read_stata(fp, index_col="index")
         tm.assert_frame_equal(reread, df)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_chunked_categorical(version):
+    df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
+    df.index.name = "index"
+    with tm.ensure_clean() as path:
+        df.to_stata(path, version=version)
+        reader = StataReader(path, chunksize=2, order_categoricals=False)
+        for i, block in enumerate(reader):
+            block = block.set_index("index")
+            assert "cats" in block
+            tm.assert_series_equal(block.cats, df.cats.iloc[2 * i : 2 * (i + 1)])
+
+
+def test_chunked_categorical_partial(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    reader = StataReader(dta_file, chunksize=2)
+    values = ["a", "b", "a", "b", 3.0]
+    with pytest.warns(CategoricalConversionWarning, match="One or more series"):
+        for i, block in enumerate(reader):
+            assert list(block.cats) == values[2 * i : 2 * (i + 1)]
+            if i < 2:
+                idx = pd.Index(["a", "b"])
+            else:
+                idx = pd.Float64Index([3.0])
+            tm.assert_index_equal(block.cats.cat.categories, idx)
+    reader = StataReader(dta_file, chunksize=5)
+    large_chunk = reader.__next__()
+    direct = read_stata(dta_file)
+    tm.assert_frame_equal(direct, large_chunk)

From 5bc23129a65d15e26b29d24dc67ca63044484f96 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Tue, 12 May 2020 10:18:34 +0100
Subject: [PATCH 2/4] MAINT: Restrict use of iterator

Restrict iterator to StataReaders constructed with a positive chunksize
---
 pandas/io/stata.py            | 10 +++++++++-
 pandas/tests/io/test_stata.py | 12 ++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index c521edcf9a55f..59c4aeeeec325 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1038,6 +1038,10 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
+        if self._chunksize is not None and (
+            not isinstance(chunksize, int) or chunksize <= 0
+        ):
+            raise ValueError("chunksize must be a positive integer when set.")
 
         # State variables for the file
         self._has_string_data = False
@@ -1503,6 +1507,10 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va
 
     def __next__(self) -> DataFrame:
+        if self._chunksize is None:
+            raise ValueError(
+                "chunksize must be set to a positive integer to use as an iterator."
+            )
         return self.read(nrows=self._chunksize or 1)
 
     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
@@ -1786,7 +1794,7 @@ def _do_convert_categoricals(
                 vl = value_label_dict[label]
                 keys = np.array([k for k in vl.keys()])
                 column = data[col]
-                if column.isin(keys).all() and self._chunksize:
+                if self._chunksize is not None and column.isin(keys).all():
                     # If all categories are in the keys and we are iterating,
                     # use the same keys for all chunks. If some are missing
                     # value labels, then we will fall back to the categories
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 9dce2fd9f15ed..33f616cfc768f 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1955,3 +1955,15 @@ def test_chunked_categorical_partial(dirpath):
     large_chunk = reader.__next__()
     direct = read_stata(dta_file)
     tm.assert_frame_equal(direct, large_chunk)
+
+
+def test_iterator_errors(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=-1)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=0)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize="apple")
+    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
+        StataReader(dta_file).__next__()

From 85d0d892525865325a83b941e33ce593001f3533 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Tue, 12 May 2020 12:56:57 +0100
Subject: [PATCH 3/4] CLN: Remove unwanted patterns

---
 pandas/io/stata.py            | 17 ++++++++++-------
 pandas/tests/io/test_stata.py | 26 ++++++++++++++------------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 59c4aeeeec325..ef82e8633610b 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1792,18 +1792,21 @@ def _do_convert_categoricals(
             if label in value_labels:
                 # Explicit call with ordered=True
                 vl = value_label_dict[label]
-                keys = np.array([k for k in vl.keys()])
+                keys = np.array(list(vl.keys()))
                 column = data[col]
-                if self._chunksize is not None and column.isin(keys).all():
+                key_matches = column.isin(keys)
+                if self._chunksize is not None and key_matches.all():
+                    initial_categories = keys
                     # If all categories are in the keys and we are iterating,
                     # use the same keys for all chunks. If some are missing
                     # value labels, then we will fall back to the categories
                     # varying across chunks.
-                    initial_categories = keys
-                    warnings.warn(
-                        categorical_conversion_warning, CategoricalConversionWarning
-                    )
                 else:
+                    if self._chunksize is not None:
+                        # warn if using an iterator
+                        warnings.warn(
+                            categorical_conversion_warning, CategoricalConversionWarning
+                        )
                     initial_categories = None
                 cat_data = Categorical(
                     column, categories=initial_categories, ordered=order_categoricals
                 )
@@ -1818,7 +1821,7 @@ def _do_convert_categoricals(
                         categories.append(category)
                 else:
                     # If all cats are matched, we can use the values
-                    categories = [v for v in vl.values()]
+                    categories = list(vl.values())
                 try:
                     # Try to catch duplicate categories
                     cat_data.categories = categories
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 33f616cfc768f..87441f238ac51 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1941,18 +1941,19 @@ def test_chunked_categorical(version):
 
 def test_chunked_categorical_partial(dirpath):
     dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
-    reader = StataReader(dta_file, chunksize=2)
     values = ["a", "b", "a", "b", 3.0]
-    with pytest.warns(CategoricalConversionWarning, match="One or more series"):
-        for i, block in enumerate(reader):
-            assert list(block.cats) == values[2 * i : 2 * (i + 1)]
-            if i < 2:
-                idx = pd.Index(["a", "b"])
-            else:
-                idx = pd.Float64Index([3.0])
-            tm.assert_index_equal(block.cats.cat.categories, idx)
-    reader = StataReader(dta_file, chunksize=5)
-    large_chunk = reader.__next__()
+    with StataReader(dta_file, chunksize=2) as reader:
+        with tm.assert_produces_warning(CategoricalConversionWarning):
+            for i, block in enumerate(reader):
+                assert list(block.cats) == values[2 * i : 2 * (i + 1)]
+                if i < 2:
+                    idx = pd.Index(["a", "b"])
+                else:
+                    idx = pd.Float64Index([3.0])
+                tm.assert_index_equal(block.cats.cat.categories, idx)
+    with tm.assert_produces_warning(CategoricalConversionWarning):
+        with StataReader(dta_file, chunksize=5) as reader:
+            large_chunk = reader.__next__()
     direct = read_stata(dta_file)
     tm.assert_frame_equal(direct, large_chunk)
 
@@ -1966,4 +1967,5 @@ def test_iterator_errors(dirpath):
     with pytest.raises(ValueError, match="chunksize must be a positive"):
         StataReader(dta_file, chunksize="apple")
     with pytest.raises(ValueError, match="chunksize must be set to a positive"):
-        StataReader(dta_file).__next__()
+        with StataReader(dta_file) as reader:
+            reader.__next__()

From fa173aa4d58bd2d677cfc3f497bf9ef011279aa7 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard
Date: Tue, 12 May 2020 17:58:01 +0100
Subject: [PATCH 4/4] TST: Add a test to check order invariance

Check that the label ordering does not cause any issues
---
 doc/source/whatsnew/v1.1.0.rst |  3 ---
 pandas/io/stata.py             | 12 ++++++++++++
 pandas/tests/io/test_stata.py  | 14 ++++++++++++++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 6857c3e41edd1..d11c3699dc86d 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -905,11 +905,8 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
-- Bug in :meth:`read_parquet` was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`)
-- Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`)
 - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`)
-
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index ef82e8633610b..e9adf5292ef6f 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -106,6 +106,14 @@
 iterator : bool, default False
     Return StataReader object."""
 
+_reader_notes = """\
+Notes
+-----
+Categorical variables read through an iterator may not have the same
+categories and dtype. This occurs when a variable stored in a DTA
+file is associated with an incomplete set of value labels that only
+label a strict subset of the values."""
+
 _read_stata_doc = f"""
 Read Stata file into DataFrame.
 
@@ -135,6 +143,8 @@
 io.stata.StataReader : Low-level reader for Stata data files.
 DataFrame.to_stata: Export Stata data files.
 
+{_reader_notes}
+
 Examples
 --------
 Read a Stata dta file:
@@ -176,6 +186,8 @@
 {_statafile_processing_params1}
 {_statafile_processing_params2}
 {_chunksize_params}
+
+{_reader_notes}
 """
 
 
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 87441f238ac51..aa3aa61bbb984 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1969,3 +1969,17 @@ def test_iterator_errors(dirpath):
     with pytest.raises(ValueError, match="chunksize must be set to a positive"):
         with StataReader(dta_file) as reader:
             reader.__next__()
+
+
+def test_iterator_value_labels():
+    # GH 31544
+    values = ["c_label", "b_label"] + ["a_label"] * 500
+    df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
+    with tm.ensure_clean() as path:
+        df.to_stata(path, write_index=False)
+        reader = pd.read_stata(path, chunksize=100)
+        expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
+        for j, chunk in enumerate(reader):
+            for i in range(2):
+                tm.assert_index_equal(chunk.dtypes[i].categories, expected)
+            tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
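
Taken together, the four patches mean that chunked Stata reads keep a stable categorical dtype whenever the value labels cover every observed value, and emit a warning when they cannot. A minimal usage sketch, assuming a pandas build that includes these patches; the ``.dta`` file names and the ``col0`` column below are placeholders, not files or data shipped with pandas:

>>> import pandas as pd
>>> # "labels.dta" stands in for a fully value-labeled Stata file
>>> reader = pd.read_stata("labels.dta", chunksize=100)
>>> for chunk in reader:
...     print(chunk["col0"].cat.categories)  # the same Index in every chunk
>>> # A partially labeled file read through an iterator now emits
>>> # pandas.io.stata.CategoricalConversionWarning instead of silently
>>> # returning chunks whose categorical dtypes disagree.
>>> next(pd.read_stata("partially-labeled.dta", chunksize=2))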