Skip to content

Commit bc2d45e

Browse files
committed
BUG: Fix encoding for Stata format 118 files
Ensure that Stata 118 files always use utf-8 encoding
1 parent 5348e06 commit bc2d45e

File tree

4 files changed

+46
-26
lines changed

4 files changed

+46
-26
lines changed

doc/source/whatsnew/v0.23.1.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,7 @@ I/O
9191

9292
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9393
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
94-
-
94+
- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
9595

9696
Plotting
9797
^^^^^^^^

pandas/io/stata.py

+26-17
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,12 @@
3636
from pandas.util._decorators import Appender
3737
from pandas.util._decorators import deprecate_kwarg
3838

39-
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
39+
# Allowed encodings of Stata dta files. Preferred is first entry
40+
VALID_ENCODINGS = ('latin-1', 'latin_1', 'ascii', 'us-ascii', 'iso-8859-1',
4041
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
4142

43+
VALID_ENCODINGS_118 = ('utf8', 'utf-8')
44+
4245
_version_error = ("Version of given Stata file is not 104, 105, 108, "
4346
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
4447
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
@@ -838,7 +841,6 @@ def get_base_missing_value(cls, dtype):
838841

839842

840843
class StataParser(object):
841-
_default_encoding = 'latin-1'
842844

843845
def __init__(self, encoding):
844846
if encoding is not None:
@@ -964,7 +966,7 @@ def __init__(self, path_or_buf, convert_dates=True,
964966
convert_categoricals=True, index_col=None,
965967
convert_missing=False, preserve_dtypes=True,
966968
columns=None, order_categoricals=True,
967-
encoding='latin-1', chunksize=None):
969+
encoding=None, chunksize=None):
968970
super(StataReader, self).__init__(encoding)
969971
self.col_sizes = ()
970972

@@ -977,10 +979,7 @@ def __init__(self, path_or_buf, convert_dates=True,
977979
self._preserve_dtypes = preserve_dtypes
978980
self._columns = columns
979981
self._order_categoricals = order_categoricals
980-
if encoding is not None:
981-
if encoding not in VALID_ENCODINGS:
982-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
983-
'supported.')
982+
self._default_encoding = None
984983
self._encoding = encoding
985984
self._chunksize = chunksize
986985

@@ -1030,6 +1029,21 @@ def close(self):
10301029
except IOError:
10311030
pass
10321031

1032+
def _check_encoding(self):
    """
    Check validity of the user-set encoding and set the default encoding.

    The accepted encodings depend on ``self.format_version``, so this must
    run after the version has been read from the file header: versions
    below 118 accept the entries of ``VALID_ENCODINGS``, version 118 and
    above accept the entries of ``VALID_ENCODINGS_118``.

    Raises
    ------
    ValueError
        If ``self._encoding`` is not None and is not one of the encodings
        accepted for this file's format version.
    """
    if self.format_version < 118:
        valid_encodings = VALID_ENCODINGS
        supported = 'latin-1 and ascii'
    else:
        valid_encodings = VALID_ENCODINGS_118
        # Do not advertise latin-1/ascii here: those are exactly the
        # encodings this branch rejects.
        supported = 'utf-8'
    if self._encoding is not None and self._encoding not in valid_encodings:
        raise ValueError('Unknown encoding. Only {0} '
                         'supported.'.format(supported))
    # Preferred encoding is first in valid_encodings; it is used whenever
    # the caller did not pass an explicit encoding.
    self._default_encoding = valid_encodings[0]
1046+
10331047
def _read_header(self):
10341048
first_char = self.path_or_buf.read(1)
10351049
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1063,7 @@ def _read_new_header(self, first_char):
10491063
self.format_version = int(self.path_or_buf.read(3))
10501064
if self.format_version not in [117, 118]:
10511065
raise ValueError(_version_error)
1066+
self._check_encoding()
10521067
self.path_or_buf.read(21) # </release><byteorder>
10531068
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
10541069
self.path_or_buf.read(15) # </byteorder><K>
@@ -1235,6 +1250,7 @@ def _read_old_header(self, first_char):
12351250
self.format_version = struct.unpack('b', first_char)[0]
12361251
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
12371252
raise ValueError(_version_error)
1253+
self._check_encoding()
12381254
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
12391255
0] == 0x1 and '>' or '<'
12401256
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1354,9 @@ def _decode(self, s):
13381354
return s.decode('utf-8')
13391355

13401356
def _null_terminate(self, s):
1341-
if compat.PY3 or self._encoding is not None:
1342-
# have bytes not strings, so must decode
1343-
s = s.partition(b"\0")[0]
1344-
return s.decode(self._encoding or self._default_encoding)
1345-
else:
1346-
null_byte = "\0"
1347-
try:
1348-
return s.lstrip(null_byte)[:s.index(null_byte)]
1349-
except:
1350-
return s
1357+
# have bytes not strings, so must decode
1358+
s = s.partition(b"\0")[0]
1359+
return s.decode(self._encoding or self._default_encoding)
13511360

13521361
def _read_value_labels(self):
13531362
if self._value_labels_read:

pandas/tests/io/data/stata16_118.dta

4.51 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+19-8
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,7 @@ def setup_method(self, method):
9696
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
9797

9898
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
99+
self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
99100

100101
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
101102

@@ -363,14 +364,9 @@ def test_encoding(self, version):
363364
encoded = read_stata(self.dta_encoding, encoding="latin-1")
364365
result = encoded.kreis1849[0]
365366

366-
if compat.PY3:
367-
expected = raw.kreis1849[0]
368-
assert result == expected
369-
assert isinstance(result, compat.string_types)
370-
else:
371-
expected = raw.kreis1849.str.decode("latin-1")[0]
372-
assert result == expected
373-
assert isinstance(result, unicode) # noqa
367+
expected = raw.kreis1849[0]
368+
assert result == expected
369+
assert isinstance(result, compat.string_types)
374370

375371
with tm.ensure_clean() as path:
376372
encoded.to_stata(path, encoding='latin-1',
@@ -1500,3 +1496,18 @@ def test_gzip_writing(self):
15001496
with gzip.GzipFile(path, 'rb') as gz:
15011497
reread = pd.read_stata(gz, index_col='index')
15021498
tm.assert_frame_equal(df, reread)
1499+
1500+
def test_unicode_dta_118(self):
# Read the stata16_118.dta fixture (self.dta25_118) through self.read_dta
# and require it to equal a frame built from the literal rows below.
1501+
unicode_df = self.read_dta(self.dta25_118)
1502+
# Expected column labels, in order.
# NOTE(review): the '_strl' suffix presumably marks strL variables -- confirm
# against the fixture.
1503+
columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
# Expected cell values, one inner list per row; non-ASCII text is written as
# u'' literals.
# NOTE(review): runs of spaces inside these literals may have been collapsed
# when this page was extracted -- confirm against the repository copy before
# relying on the whitespace-only cells.
1504+
values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
1505+
[u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
1506+
[u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
1507+
[' ', ' ', 'd', ' ', 'd'],
1508+
[' ', '', 'a', ' ', 'a'],
1509+
['', '', 's', '', 's'],
1510+
['', '', ' ', '', ' ']]
1511+
expected = pd.DataFrame(values, columns=columns)
1512+
# Compare the decoded frame with the expected frame.
1513+
tm.assert_frame_equal(unicode_df, expected)

0 commit comments

Comments
 (0)