Skip to content

Commit d3b437c

Browse files
[3.12] gh-123378: fix a crash in UnicodeError.__str__ (GH-124935) (#125098)
gh-123378: fix a crash in `UnicodeError.__str__` (GH-124935)

(cherry picked from commit ba14dfa)

Co-authored-by: Bénédikt Tran <[email protected]>
1 parent 8225737 commit d3b437c

File tree

3 files changed

+93
-45
lines changed

3 files changed

+93
-45
lines changed

Lib/test/test_exceptions.py

+24
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import weakref
99
import errno
1010
from codecs import BOM_UTF8
11+
from itertools import product
1112
from textwrap import dedent
1213

1314
from test.support import (captured_stderr, check_impl_detail,
@@ -1333,6 +1334,29 @@ def test_unicode_errors_no_object(self):
13331334
for klass in klasses:
13341335
self.assertEqual(str(klass.__new__(klass)), "")
13351336

1337+
def test_unicode_error_str_does_not_crash(self):
1338+
# Test that str(UnicodeError(...)) does not crash.
1339+
# See https://github.com/python/cpython/issues/123378.
1340+
1341+
for start, end, objlen in product(
1342+
range(-5, 5),
1343+
range(-5, 5),
1344+
range(7),
1345+
):
1346+
obj = 'a' * objlen
1347+
with self.subTest('encode', objlen=objlen, start=start, end=end):
1348+
exc = UnicodeEncodeError('utf-8', obj, start, end, '')
1349+
self.assertIsInstance(str(exc), str)
1350+
1351+
with self.subTest('translate', objlen=objlen, start=start, end=end):
1352+
exc = UnicodeTranslateError(obj, start, end, '')
1353+
self.assertIsInstance(str(exc), str)
1354+
1355+
encoded = obj.encode()
1356+
with self.subTest('decode', objlen=objlen, start=start, end=end):
1357+
exc = UnicodeDecodeError('utf-8', encoded, start, end, '')
1358+
self.assertIsInstance(str(exc), str)
1359+
13361360
@no_tracing
13371361
def test_badisinstance(self):
13381362
# Bug #2542: if issubclass(e, MyException) raises an exception,
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Fix a crash in the :meth:`~object.__str__` method of :exc:`UnicodeError`
2+
objects when the :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
3+
values are invalid or out-of-range. Patch by Bénédikt Tran.

Objects/exceptions.c

+66-45
Original file line number | Diff line number | Diff line change
@@ -2961,46 +2961,55 @@ UnicodeEncodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
29612961
static PyObject *
29622962
UnicodeEncodeError_str(PyObject *self)
29632963
{
2964-
PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
2964+
PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
29652965
PyObject *result = NULL;
29662966
PyObject *reason_str = NULL;
29672967
PyObject *encoding_str = NULL;
29682968

2969-
if (!uself->object)
2969+
if (exc->object == NULL) {
29702970
/* Not properly initialized. */
29712971
return PyUnicode_FromString("");
2972+
}
29722973

29732974
/* Get reason and encoding as strings, which they might not be if
29742975
they've been modified after we were constructed. */
2975-
reason_str = PyObject_Str(uself->reason);
2976-
if (reason_str == NULL)
2976+
reason_str = PyObject_Str(exc->reason);
2977+
if (reason_str == NULL) {
29772978
goto done;
2978-
encoding_str = PyObject_Str(uself->encoding);
2979-
if (encoding_str == NULL)
2979+
}
2980+
encoding_str = PyObject_Str(exc->encoding);
2981+
if (encoding_str == NULL) {
29802982
goto done;
2983+
}
2984+
2985+
Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
2986+
Py_ssize_t start = exc->start, end = exc->end;
29812987

2982-
if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
2983-
Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
2988+
if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
2989+
Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
29842990
const char *fmt;
2985-
if (badchar <= 0xff)
2991+
if (badchar <= 0xff) {
29862992
fmt = "'%U' codec can't encode character '\\x%02x' in position %zd: %U";
2987-
else if (badchar <= 0xffff)
2993+
}
2994+
else if (badchar <= 0xffff) {
29882995
fmt = "'%U' codec can't encode character '\\u%04x' in position %zd: %U";
2989-
else
2996+
}
2997+
else {
29902998
fmt = "'%U' codec can't encode character '\\U%08x' in position %zd: %U";
2999+
}
29913000
result = PyUnicode_FromFormat(
29923001
fmt,
29933002
encoding_str,
29943003
(int)badchar,
2995-
uself->start,
3004+
start,
29963005
reason_str);
29973006
}
29983007
else {
29993008
result = PyUnicode_FromFormat(
30003009
"'%U' codec can't encode characters in position %zd-%zd: %U",
30013010
encoding_str,
3002-
uself->start,
3003-
uself->end-1,
3011+
start,
3012+
end - 1,
30043013
reason_str);
30053014
}
30063015
done:
@@ -3074,41 +3083,46 @@ UnicodeDecodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
30743083
static PyObject *
30753084
UnicodeDecodeError_str(PyObject *self)
30763085
{
3077-
PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
3086+
PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
30783087
PyObject *result = NULL;
30793088
PyObject *reason_str = NULL;
30803089
PyObject *encoding_str = NULL;
30813090

3082-
if (!uself->object)
3091+
if (exc->object == NULL) {
30833092
/* Not properly initialized. */
30843093
return PyUnicode_FromString("");
3094+
}
30853095

30863096
/* Get reason and encoding as strings, which they might not be if
30873097
they've been modified after we were constructed. */
3088-
reason_str = PyObject_Str(uself->reason);
3089-
if (reason_str == NULL)
3098+
reason_str = PyObject_Str(exc->reason);
3099+
if (reason_str == NULL) {
30903100
goto done;
3091-
encoding_str = PyObject_Str(uself->encoding);
3092-
if (encoding_str == NULL)
3101+
}
3102+
encoding_str = PyObject_Str(exc->encoding);
3103+
if (encoding_str == NULL) {
30933104
goto done;
3105+
}
3106+
3107+
Py_ssize_t len = PyBytes_GET_SIZE(exc->object);
3108+
Py_ssize_t start = exc->start, end = exc->end;
30943109

3095-
if (uself->start < PyBytes_GET_SIZE(uself->object) && uself->end == uself->start+1) {
3096-
int byte = (int)(PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[uself->start]&0xff);
3110+
if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
3111+
int badbyte = (int)(PyBytes_AS_STRING(exc->object)[start] & 0xff);
30973112
result = PyUnicode_FromFormat(
30983113
"'%U' codec can't decode byte 0x%02x in position %zd: %U",
30993114
encoding_str,
3100-
byte,
3101-
uself->start,
3115+
badbyte,
3116+
start,
31023117
reason_str);
31033118
}
31043119
else {
31053120
result = PyUnicode_FromFormat(
31063121
"'%U' codec can't decode bytes in position %zd-%zd: %U",
31073122
encoding_str,
3108-
uself->start,
3109-
uself->end-1,
3110-
reason_str
3111-
);
3123+
start,
3124+
end - 1,
3125+
reason_str);
31123126
}
31133127
done:
31143128
Py_XDECREF(reason_str);
@@ -3171,42 +3185,49 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args,
31713185
static PyObject *
31723186
UnicodeTranslateError_str(PyObject *self)
31733187
{
3174-
PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
3188+
PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
31753189
PyObject *result = NULL;
31763190
PyObject *reason_str = NULL;
31773191

3178-
if (!uself->object)
3192+
if (exc->object == NULL) {
31793193
/* Not properly initialized. */
31803194
return PyUnicode_FromString("");
3195+
}
31813196

31823197
/* Get reason as a string, which it might not be if it's been
31833198
modified after we were constructed. */
3184-
reason_str = PyObject_Str(uself->reason);
3185-
if (reason_str == NULL)
3199+
reason_str = PyObject_Str(exc->reason);
3200+
if (reason_str == NULL) {
31863201
goto done;
3202+
}
3203+
3204+
Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
3205+
Py_ssize_t start = exc->start, end = exc->end;
31873206

3188-
if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
3189-
Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
3207+
if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
3208+
Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
31903209
const char *fmt;
3191-
if (badchar <= 0xff)
3210+
if (badchar <= 0xff) {
31923211
fmt = "can't translate character '\\x%02x' in position %zd: %U";
3193-
else if (badchar <= 0xffff)
3212+
}
3213+
else if (badchar <= 0xffff) {
31943214
fmt = "can't translate character '\\u%04x' in position %zd: %U";
3195-
else
3215+
}
3216+
else {
31963217
fmt = "can't translate character '\\U%08x' in position %zd: %U";
3218+
}
31973219
result = PyUnicode_FromFormat(
31983220
fmt,
31993221
(int)badchar,
3200-
uself->start,
3201-
reason_str
3202-
);
3203-
} else {
3222+
start,
3223+
reason_str);
3224+
}
3225+
else {
32043226
result = PyUnicode_FromFormat(
32053227
"can't translate characters in position %zd-%zd: %U",
3206-
uself->start,
3207-
uself->end-1,
3208-
reason_str
3209-
);
3228+
start,
3229+
end - 1,
3230+
reason_str);
32103231
}
32113232
done:
32123233
Py_XDECREF(reason_str);

0 commit comments

Comments
 (0)