36
36
from pandas .util ._decorators import Appender
37
37
from pandas .util ._decorators import deprecate_kwarg
38
38
39
- VALID_ENCODINGS = ('ascii' , 'us-ascii' , 'latin-1' , 'latin_1' , 'iso-8859-1' ,
39
+ # Allowed encodings of Stata dta files. Preferred is first entry
40
+ VALID_ENCODINGS = ('latin-1' , 'latin_1' , 'ascii' , 'us-ascii' , 'iso-8859-1' ,
40
41
'iso8859-1' , '8859' , 'cp819' , 'latin' , 'latin1' , 'L1' )
41
42
43
+ VALID_ENCODINGS_118 = ('utf8' , 'utf-8' )
44
+
42
45
_version_error = ("Version of given Stata file is not 104, 105, 108, "
43
46
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
44
47
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" )
@@ -838,7 +841,6 @@ def get_base_missing_value(cls, dtype):
838
841
839
842
840
843
class StataParser (object ):
841
- _default_encoding = 'latin-1'
842
844
843
845
def __init__ (self , encoding ):
844
846
if encoding is not None :
@@ -964,7 +966,7 @@ def __init__(self, path_or_buf, convert_dates=True,
964
966
convert_categoricals = True , index_col = None ,
965
967
convert_missing = False , preserve_dtypes = True ,
966
968
columns = None , order_categoricals = True ,
967
- encoding = 'latin-1' , chunksize = None ):
969
+ encoding = None , chunksize = None ):
968
970
super (StataReader , self ).__init__ (encoding )
969
971
self .col_sizes = ()
970
972
@@ -977,10 +979,7 @@ def __init__(self, path_or_buf, convert_dates=True,
977
979
self ._preserve_dtypes = preserve_dtypes
978
980
self ._columns = columns
979
981
self ._order_categoricals = order_categoricals
980
- if encoding is not None :
981
- if encoding not in VALID_ENCODINGS :
982
- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
983
- 'supported.' )
982
+ self ._default_encoding = None
984
983
self ._encoding = encoding
985
984
self ._chunksize = chunksize
986
985
@@ -1030,6 +1029,21 @@ def close(self):
1030
1029
except IOError :
1031
1030
pass
1032
1031
1032
+ def _check_encoding (self ):
1033
+ """
1034
+ Check validity of user-set encoding and set the default encoding
1035
+ """
1036
+ if self .format_version < 118 :
1037
+ valid_encodings = VALID_ENCODINGS
1038
+ else :
1039
+ valid_encodings = VALID_ENCODINGS_118
1040
+ if self ._encoding is not None :
1041
+ if self ._encoding not in valid_encodings :
1042
+ raise ValueError ('Unknown encoding. Only latin-1 and ascii '
1043
+ 'supported.' )
1044
+ # Preferred encoding is first in valid_encodings
1045
+ self ._default_encoding = valid_encodings [0 ]
1046
+
1033
1047
def _read_header (self ):
1034
1048
first_char = self .path_or_buf .read (1 )
1035
1049
if struct .unpack ('c' , first_char )[0 ] == b'<' :
@@ -1049,6 +1063,7 @@ def _read_new_header(self, first_char):
1049
1063
self .format_version = int (self .path_or_buf .read (3 ))
1050
1064
if self .format_version not in [117 , 118 ]:
1051
1065
raise ValueError (_version_error )
1066
+ self ._check_encoding ()
1052
1067
self .path_or_buf .read (21 ) # </release><byteorder>
1053
1068
self .byteorder = self .path_or_buf .read (3 ) == b'MSF' and '>' or '<'
1054
1069
self .path_or_buf .read (15 ) # </byteorder><K>
@@ -1235,6 +1250,7 @@ def _read_old_header(self, first_char):
1235
1250
self .format_version = struct .unpack ('b' , first_char )[0 ]
1236
1251
if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
1237
1252
raise ValueError (_version_error )
1253
+ self ._check_encoding ()
1238
1254
self .byteorder = struct .unpack ('b' , self .path_or_buf .read (1 ))[
1239
1255
0 ] == 0x1 and '>' or '<'
1240
1256
self .filetype = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
@@ -1338,16 +1354,9 @@ def _decode(self, s):
1338
1354
return s .decode ('utf-8' )
1339
1355
1340
1356
def _null_terminate (self , s ):
1341
- if compat .PY3 or self ._encoding is not None :
1342
- # have bytes not strings, so must decode
1343
- s = s .partition (b"\0 " )[0 ]
1344
- return s .decode (self ._encoding or self ._default_encoding )
1345
- else :
1346
- null_byte = "\0 "
1347
- try :
1348
- return s .lstrip (null_byte )[:s .index (null_byte )]
1349
- except :
1350
- return s
1357
+ # have bytes not strings, so must decode
1358
+ s = s .partition (b"\0 " )[0 ]
1359
+ return s .decode (self ._encoding or self ._default_encoding )
1351
1360
1352
1361
def _read_value_labels (self ):
1353
1362
if self ._value_labels_read :
0 commit comments