Skip to content

Commit dc00bd8

Browse files
committed
MAINT: Deprecate encoding from stata reader/writer
Deprecate the encoding parameter of all Stata reading and writing methods and classes. The encoding is determined solely by the dta file format version and cannot be changed by users.
1 parent 415012f commit dc00bd8

File tree

4 files changed

+24
-26
lines changed

4 files changed

+24
-26
lines changed

doc/source/whatsnew/v0.24.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ Other API Changes
4545
Deprecations
4646
~~~~~~~~~~~~
4747

48-
-
48+
- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
4949
-
5050
-
5151

pandas/core/frame.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@
8080
from pandas.compat import PY36
8181
from pandas.compat.numpy import function as nv
8282
from pandas.util._decorators import (Appender, Substitution,
83-
rewrite_axis_style_signature)
83+
rewrite_axis_style_signature,
84+
deprecate_kwarg)
8485
from pandas.util._validators import (validate_bool_kwarg,
8586
validate_axis_style_args)
8687

@@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
17641765
startcol=startcol, freeze_panes=freeze_panes,
17651766
engine=engine)
17661767

1768+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
17671769
def to_stata(self, fname, convert_dates=None, write_index=True,
17681770
encoding="latin-1", byteorder=None, time_stamp=None,
17691771
data_label=None, variable_labels=None, version=114,
@@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
18691871
kwargs['convert_strl'] = convert_strl
18701872

18711873
writer = statawriter(fname, self, convert_dates=convert_dates,
1872-
encoding=encoding, byteorder=byteorder,
1873-
time_stamp=time_stamp, data_label=data_label,
1874-
write_index=write_index,
1874+
byteorder=byteorder, time_stamp=time_stamp,
1875+
data_label=data_label, write_index=write_index,
18751876
variable_labels=variable_labels, **kwargs)
18761877
writer.write_file()
18771878

pandas/io/stata.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@
3333
from pandas.core.series import Series
3434
from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
3535
_stringify_path)
36-
from pandas.util._decorators import Appender
37-
from pandas.util._decorators import deprecate_kwarg
36+
from pandas.util._decorators import Appender, deprecate_kwarg
3837

3938
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
4039
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
@@ -169,6 +168,7 @@
169168

170169

171170
@Appender(_read_stata_doc)
171+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
172172
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
173173
def read_stata(filepath_or_buffer, convert_dates=True,
174174
convert_categoricals=True, encoding=None, index_col=None,
@@ -952,6 +952,7 @@ def __init__(self):
952952
class StataReader(StataParser, BaseIterator):
953953
__doc__ = _stata_reader_doc
954954

955+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
955956
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
956957
def __init__(self, path_or_buf, convert_dates=True,
957958
convert_categoricals=True, index_col=None,
@@ -970,7 +971,7 @@ def __init__(self, path_or_buf, convert_dates=True,
970971
self._preserve_dtypes = preserve_dtypes
971972
self._columns = columns
972973
self._order_categoricals = order_categoricals
973-
self._encoding = encoding
974+
self._encoding = None
974975
self._chunksize = chunksize
975976

976977
# State variables for the file
@@ -1962,17 +1963,14 @@ class StataWriter(StataParser):
19621963

19631964
_max_string_length = 244
19641965

1966+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
19651967
def __init__(self, fname, data, convert_dates=None, write_index=True,
19661968
encoding="latin-1", byteorder=None, time_stamp=None,
19671969
data_label=None, variable_labels=None):
19681970
super(StataWriter, self).__init__()
19691971
self._convert_dates = {} if convert_dates is None else convert_dates
19701972
self._write_index = write_index
1971-
if encoding is not None:
1972-
if encoding not in VALID_ENCODINGS:
1973-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
1974-
'supported.')
1975-
self._encoding = encoding
1973+
self._encoding = 'latin-1'
19761974
self._time_stamp = time_stamp
19771975
self._data_label = data_label
19781976
self._variable_labels = variable_labels
@@ -2731,16 +2729,18 @@ class StataWriter117(StataWriter):
27312729

27322730
_max_string_length = 2045
27332731

2732+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
27342733
def __init__(self, fname, data, convert_dates=None, write_index=True,
27352734
encoding="latin-1", byteorder=None, time_stamp=None,
27362735
data_label=None, variable_labels=None, convert_strl=None):
27372736
# Shallow copy since convert_strl might be modified later
27382737
self._convert_strl = [] if convert_strl is None else convert_strl[:]
27392738

27402739
super(StataWriter117, self).__init__(fname, data, convert_dates,
2741-
write_index, encoding, byteorder,
2742-
time_stamp, data_label,
2743-
variable_labels)
2740+
write_index, byteorder=byteorder,
2741+
time_stamp=time_stamp,
2742+
data_label=data_label,
2743+
variable_labels=variable_labels)
27442744
self._map = None
27452745
self._strl_blob = None
27462746

pandas/tests/io/test_stata.py

+7-10
Original file line numberDiff line numberDiff line change
@@ -361,16 +361,20 @@ def test_encoding(self, version):
361361

362362
# GH 4626, proper encoding handling
363363
raw = read_stata(self.dta_encoding)
364-
encoded = read_stata(self.dta_encoding, encoding="latin-1")
364+
with warnings.catch_warnings(record=True) as w:
365+
encoded = read_stata(self.dta_encoding, encoding='latin-1')
366+
assert len(w) == 1
365367
result = encoded.kreis1849[0]
366368

367369
expected = raw.kreis1849[0]
368370
assert result == expected
369371
assert isinstance(result, compat.string_types)
370372

371373
with tm.ensure_clean() as path:
372-
encoded.to_stata(path, encoding='latin-1',
373-
write_index=False, version=version)
374+
with warnings.catch_warnings(record=True) as w:
375+
encoded.to_stata(path, write_index=False, version=version,
376+
encoding='latin-1')
377+
assert len(w) == 1
374378
reread_encoded = read_stata(path)
375379
tm.assert_frame_equal(encoded, reread_encoded)
376380

@@ -1349,13 +1353,6 @@ def test_out_of_range_float(self):
13491353
assert 'ColumnTooBig' in cm.exception
13501354
assert 'infinity' in cm.exception
13511355

1352-
def test_invalid_encoding(self):
1353-
# GH15723, validate encoding
1354-
original = self.read_csv(self.csv3)
1355-
with pytest.raises(ValueError):
1356-
with tm.ensure_clean() as path:
1357-
original.to_stata(path, encoding='utf-8')
1358-
13591356
def test_path_pathlib(self):
13601357
df = tm.makeDataFrame()
13611358
df.index.name = 'index'

0 commit comments

Comments
 (0)