diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index f641e7ba4c6f7..18ba5fd46696a 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`) - Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`) - Fixed regression in certain offsets (:meth:`pd.offsets.Day() ` and below) no longer being hashable (:issue:`37267`) +- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7677d8a94d521..8e1c72d4aaa7e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -477,7 +477,7 @@ class PossiblePrecisionLoss(Warning): precision_loss_doc = """ -Column converted from %s to %s, and some data are outside of the lossless +Column converted from {0} to {1}, and some data are outside of the lossless conversion range. This may result in a loss of precision in the saved data. """ @@ -551,7 +551,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: object in a DataFrame. """ ws = "" - # original, if small, if large + # original, if small, if large conversion_data = ( (np.bool_, np.int8, np.int8), (np.uint8, np.int8, np.int16), @@ -571,7 +571,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: dtype = c_data[1] else: dtype = c_data[2] - if c_data[2] == np.float64: # Warn if necessary + if c_data[2] == np.int64: # Warn if necessary if data[col].max() >= 2 ** 53: ws = precision_loss_doc.format("uint64", "float64") @@ -635,12 +635,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) self.text_len = 0 - self.off: List[int] = [] - self.val: List[int] = [] self.txt: List[bytes] = [] self.n = 0 # Compute lengths and setup lists of offsets and labels + offsets: List[int] = [] + values: List[int] = [] for vl in self.value_labels: category = vl[1] if not isinstance(category, str): @@ -650,9 +650,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): ValueLabelTypeMismatch, ) category = category.encode(encoding) - self.off.append(self.text_len) + offsets.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding - self.val.append(vl[0]) + values.append(vl[0]) self.txt.append(category) self.n += 1 @@ -663,8 +663,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): ) # Ensure int32 - self.off = np.array(self.off, dtype=np.int32) - self.val = np.array(self.val, dtype=np.int32) + self.off = np.array(offsets, dtype=np.int32) + self.val = np.array(values, dtype=np.int32) # Total length self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len @@ -876,23 +876,23 @@ def __init__(self): # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int self.DTYPE_MAP = dict( - list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)])) + list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) + [ - (251, np.int8), - (252, np.int16), - (253, np.int32), - (254, np.float32), - (255, np.float64), + (251, np.dtype(np.int8)), + (252, np.dtype(np.int16)), + (253, np.dtype(np.int32)), + (254, np.dtype(np.float32)), + (255, np.dtype(np.float64)), ] ) self.DTYPE_MAP_XML = dict( [ - (32768, np.uint8), # Keys to GSO - (65526, np.float64), - (65527, np.float32), - (65528, np.int32), - (65529, np.int16), - (65530, np.int8), + (32768, np.dtype(np.uint8)), # Keys to GSO + (65526, np.dtype(np.float64)), + (65527, np.dtype(np.float32)), + (65528, np.dtype(np.int32)), + (65529, np.dtype(np.int16)), + (65530, np.dtype(np.int8)), ] ) self.TYPE_MAP = list(range(251)) + list("bhlfd") @@ -1050,9 +1050,10 @@ def __init__( self._order_categoricals = order_categoricals self._encoding = "" self._chunksize = chunksize - if self._chunksize is not None and ( - not isinstance(chunksize, int) or chunksize <= 0 - ): + self._using_iterator = False + if self._chunksize is None: + self._chunksize = 1 + elif not isinstance(chunksize, int) or chunksize <= 0: raise ValueError("chunksize must be a positive integer when set.") # State variables for the file @@ -1062,7 +1063,7 @@ def __init__( self._column_selector_set = False self._value_labels_read = False self._data_read = False - self._dtype = None + self._dtype: Optional[np.dtype] = None self._lines_read = 0 self._native_byteorder = _set_endianness(sys.byteorder) @@ -1195,7 +1196,7 @@ def _read_new_header(self) -> None: # Get data type information, works for versions 117-119. def _get_dtypes( self, seek_vartypes: int - ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]: + ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]: self.path_or_buf.seek(seek_vartypes) raw_typlist = [ @@ -1519,11 +1520,8 @@ def _read_strls(self) -> None: self.GSO[str(v_o)] = decoded_va def __next__(self) -> DataFrame: - if self._chunksize is None: - raise ValueError( - "chunksize must be set to a positive integer to use as an iterator." - ) - return self.read(nrows=self._chunksize or 1) + self._using_iterator = True + return self.read(nrows=self._chunksize) def get_chunk(self, size: Optional[int] = None) -> DataFrame: """ @@ -1692,11 +1690,15 @@ def any_startswith(x: str) -> bool: convert = False for col in data: dtype = data[col].dtype - if dtype in (np.float16, np.float32): - dtype = np.float64 + if dtype in (np.dtype(np.float16), np.dtype(np.float32)): + dtype = np.dtype(np.float64) convert = True - elif dtype in (np.int8, np.int16, np.int32): - dtype = np.int64 + elif dtype in ( + np.dtype(np.int8), + np.dtype(np.int16), + np.dtype(np.int32), + ): + dtype = np.dtype(np.int64) convert = True retyped_data.append((col, data[col].astype(dtype))) if convert: @@ -1807,14 +1809,14 @@ def _do_convert_categoricals( keys = np.array(list(vl.keys())) column = data[col] key_matches = column.isin(keys) - if self._chunksize is not None and key_matches.all(): - initial_categories = keys + if self._using_iterator and key_matches.all(): + initial_categories: Optional[np.ndarray] = keys # If all categories are in the keys and we are iterating, # use the same keys for all chunks. If some are missing # value labels, then we will fall back to the categories # varying across chunks. else: - if self._chunksize is not None: + if self._using_iterator: # warn is using an iterator warnings.warn( categorical_conversion_warning, CategoricalConversionWarning @@ -2010,7 +2012,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: "ty", "%ty", ]: - return np.float64 # Stata expects doubles for SIFs + return np.dtype(np.float64) # Stata expects doubles for SIFs else: raise NotImplementedError(f"Format {fmt} not implemented") diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6d7fec803a8e0..9788602242128 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath): StataReader(dta_file, chunksize=0) with pytest.raises(ValueError, match="chunksize must be a positive"): StataReader(dta_file, chunksize="apple") - with pytest.raises(ValueError, match="chunksize must be set to a positive"): - with StataReader(dta_file) as reader: - reader.__next__() def test_iterator_value_labels(): @@ -1983,3 +1980,18 @@ def test_iterator_value_labels(): for i in range(2): tm.assert_index_equal(chunk.dtypes[i].categories, expected) tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) + + +def test_precision_loss(): + df = DataFrame( + [[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]], + columns=["big", "little"], + ) + with tm.ensure_clean() as path: + with tm.assert_produces_warning(PossiblePrecisionLoss): + df.to_stata(path, write_index=False) + reread = read_stata(path) + expected_dt = Series([np.float64, np.float64], index=["big", "little"]) + tm.assert_series_equal(reread.dtypes, expected_dt) + assert reread.loc[0, "little"] == df.loc[0, "little"] + assert reread.loc[0, "big"] == float(df.loc[0, "big"])