From 6dbcdeadf5fa50d56a7ead060d7a832021e74ad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:32:23 +0100 Subject: [PATCH 01/12] fix `codecs.replace_errors` --- Lib/test/test_capi/test_codecs.py | 7 ++-- Python/codecs.c | 54 ++++++++++++++++++------------- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index a557e35e68915d..d202b23eee7597 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -839,7 +839,8 @@ def test_codec_ignore_errors_handler(self): def test_codec_replace_errors_handler(self): handler = _testcapi.codec_replace_errors - self.do_test_codec_errors_handler(handler, self.all_unicode_errors) + self.do_test_codec_errors_handler(handler, self.all_unicode_errors, + safe=True) def test_codec_xmlcharrefreplace_errors_handler(self): handler = _testcapi.codec_xmlcharrefreplace_errors @@ -853,12 +854,12 @@ def test_codec_namereplace_errors_handler(self): handler = _testlimitedcapi.codec_namereplace_errors self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) - def do_test_codec_errors_handler(self, handler, exceptions): + def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False): at_least_one = False for exc in exceptions: # See https://github.com/python/cpython/issues/123378 and related # discussion and issues for details. - if self._exception_may_crash(exc): + if not safe and self._exception_may_crash(exc): continue at_least_one = True diff --git a/Python/codecs.c b/Python/codecs.c index 2cb3875db35058..b285151338f80b 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -702,48 +702,55 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc) PyObject *PyCodec_ReplaceErrors(PyObject *exc) { - Py_ssize_t start, end, i, len; + Py_ssize_t start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - PyObject *res; - Py_UCS1 *outp; - if (PyUnicodeEncodeError_GetStart(exc, &start)) + if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) { return NULL; - if (PyUnicodeEncodeError_GetEnd(exc, &end)) + } + if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) { return NULL; - len = end - start; - res = PyUnicode_New(len, '?'); - if (res == NULL) + } + if (end <= start) { + goto oob; + } + Py_ssize_t len = end - start; + PyObject *res = PyUnicode_New(len, '?'); + if (res == NULL) { return NULL; + } assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); - outp = PyUnicode_1BYTE_DATA(res); - for (i = 0; i < len; ++i) - outp[i] = '?'; + Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); + memset(outp, (int)'?', len); assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - if (PyUnicodeDecodeError_GetEnd(exc, &end)) + if (PyUnicodeDecodeError_GetEnd(exc, &end)) { return NULL; + } return Py_BuildValue("(Cn)", (int)Py_UNICODE_REPLACEMENT_CHARACTER, end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { - PyObject *res; - Py_UCS2 *outp; - if (PyUnicodeTranslateError_GetStart(exc, &start)) + if (PyUnicodeTranslateError_GetStart(exc, &start) < 0) { return NULL; - if (PyUnicodeTranslateError_GetEnd(exc, &end)) + } + if (PyUnicodeTranslateError_GetEnd(exc, &end) < 0) { return NULL; - len = end - start; - res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); - if (res == NULL) + } + if (end <= start) { + goto oob; + } + Py_ssize_t len = end - start; + PyObject *res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); + if (res == NULL) { return NULL; + } assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); - outp = PyUnicode_2BYTE_DATA(res); - for (i = 0; i < len; i++) - outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; + Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); + memset(outp, (int)Py_UNICODE_REPLACEMENT_CHARACTER, len); assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } @@ -751,6 +758,9 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } + +oob: + return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); } PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) From 193c051095f65fc712f4c4bda27ca780039d20f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:33:01 +0100 Subject: [PATCH 02/12] blurb --- .../2024-12-06-11-32-58.gh-issue-126004.CYAwTB.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-32-58.gh-issue-126004.CYAwTB.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-32-58.gh-issue-126004.CYAwTB.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-32-58.gh-issue-126004.CYAwTB.rst new file mode 100644 index 00000000000000..de70c59ee48eec --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-32-58.gh-issue-126004.CYAwTB.rst @@ -0,0 +1,3 @@ +Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end` +values in the :func:`codecs.replace_errors` error handler. Patch by Bénédikt +Tran. From a9140f549db1bb2f30d8b7bd65d9626f6c56850a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:47:54 +0100 Subject: [PATCH 03/12] fix tests --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index b285151338f80b..1ce975b1e411fb 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -721,7 +721,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); - memset(outp, (int)'?', len); + memset(outp, (int)'?', sizeof(Py_UCS1) * len); assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } From 98db6a2958548f403f8de55bdc442f64c695b1a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:41:12 +0100 Subject: [PATCH 04/12] fix tests --- Python/codecs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 1ce975b1e411fb..a353db590207bf 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -750,7 +750,9 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); - memset(outp, (int)Py_UNICODE_REPLACEMENT_CHARACTER, len); + for (Py_ssize_t i = 0; i < len; ++i) { + outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; + } assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } From 37ffb437624cee076d85cb99c53da9e3628213c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:55:04 +0100 Subject: [PATCH 05/12] cosmetic changes --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index a353db590207bf..bd563f4ab6c135 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -726,7 +726,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) return Py_BuildValue("(Nn)", res, end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - if (PyUnicodeDecodeError_GetEnd(exc, &end)) { + if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { return NULL; } return Py_BuildValue("(Cn)", From 4fe70a00a59915f2d78709b82fc558c9144d2f94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 3 Jan 2025 14:12:36 +0100 Subject: [PATCH 06/12] use internal `_PyUnicodeError_GetParams` helper --- Python/codecs.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index bd563f4ab6c135..1e475845474dd8 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -705,10 +705,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) Py_ssize_t start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) { - return NULL; - } - if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) { + if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, true) < 0) { return NULL; } if (end <= start) { @@ -726,7 +723,8 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) return Py_BuildValue("(Nn)", res, end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { + // _PyUnicodeError_GetParams() is slightly faster than the public getter + if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, &end, true) < 0) { return NULL; } return Py_BuildValue("(Cn)", @@ -734,10 +732,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { - if (PyUnicodeTranslateError_GetStart(exc, &start) < 0) { - return NULL; - } - if (PyUnicodeTranslateError_GetEnd(exc, &end) < 0) { + if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, false) < 0) { return NULL; } if (end <= start) { From 24bb0305073831a02200a2c0972b7b703d874eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 3 Jan 2025 14:25:42 +0100 Subject: [PATCH 07/12] fixup --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 1e475845474dd8..46531de17c92e1 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -705,7 +705,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) Py_ssize_t start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, true) < 0) { + if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, false) < 0) { return NULL; } if (end <= start) { From 31235a71254dca0a4b91c6cc79c14ed25b77e7b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:56:55 +0100 Subject: [PATCH 08/12] update usages of `_PyUnicodeError_GetParams` --- Python/codecs.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 46531de17c92e1..e9e0fed1abadca 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -700,31 +700,30 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc) } -PyObject *PyCodec_ReplaceErrors(PyObject *exc) +PyObject * +PyCodec_ReplaceErrors(PyObject *exc) { - Py_ssize_t start, end; + Py_ssize_t start, end, slen; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, false) < 0) { + if (_PyUnicodeError_GetParams(exc, NULL, NULL, + &start, &end, &slen, false) < 0) { return NULL; } - if (end <= start) { - goto oob; - } - Py_ssize_t len = end - start; - PyObject *res = PyUnicode_New(len, '?'); + PyObject *res = PyUnicode_New(slen, '?'); if (res == NULL) { return NULL; } assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); - memset(outp, (int)'?', sizeof(Py_UCS1) * len); + memset(outp, (int)'?', sizeof(Py_UCS1) * slen); assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { // _PyUnicodeError_GetParams() is slightly faster than the public getter - if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, &end, true) < 0) { + if (_PyUnicodeError_GetParams(exc, NULL, NULL, + NULL, &end, NULL, true) < 0) { return NULL; } return Py_BuildValue("(Cn)", @@ -732,20 +731,17 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { - if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, false) < 0) { + if (_PyUnicodeError_GetParams(exc, NULL, NULL, + &start, &end, &slen, false) < 0) { return NULL; } - if (end <= start) { - goto oob; - } - Py_ssize_t len = end - start; - PyObject *res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); + PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER); if (res == NULL) { return NULL; } - assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); + assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); - for (Py_ssize_t i = 0; i < len; ++i) { + for (Py_ssize_t i = 0; i < slen; ++i) { outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; } assert(_PyUnicode_CheckConsistency(res, 1)); @@ -755,9 +751,6 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } - -oob: - return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); } PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) From 7626ceb37f23cf1b23eb0c0e4a569328c7db8ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:00:21 +0100 Subject: [PATCH 09/12] amend some cosmetic changes to be consistent --- Python/codecs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index e9e0fed1abadca..c321c6d2e50ced 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -700,8 +700,7 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc) } -PyObject * -PyCodec_ReplaceErrors(PyObject *exc) +PyObject *PyCodec_ReplaceErrors(PyObject *exc) { Py_ssize_t start, end, slen; From 306b8f6dcc52282023f5d200b8042f37ef2cf5f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:56:37 +0100 Subject: [PATCH 10/12] remove unnecessary cast and comment --- Python/codecs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index c321c6d2e50ced..1b4ce255ad801f 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -715,12 +715,11 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); - memset(outp, (int)'?', sizeof(Py_UCS1) * slen); + memset(outp, '?', sizeof(Py_UCS1) * slen); assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - // _PyUnicodeError_GetParams() is slightly faster than the public getter if (_PyUnicodeError_GetParams(exc, NULL, NULL, NULL, &end, NULL, true) < 0) { return NULL; From d71b3a592279455e733ef3b616ca98582d84bcbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 22 Jan 2025 14:21:15 +0100 Subject: [PATCH 11/12] use memset instead of a for-loop --- Python/codecs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 1b4ce255ad801f..005801bcda3612 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -739,9 +739,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); - for (Py_ssize_t i = 0; i < slen; ++i) { - outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; - } + memset(outp, Py_UNICODE_REPLACEMENT_CHARACTER, sizeof(Py_UCS2) * slen); assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } From 39951be5d9c84171ad1e6200f09b807933c5662a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 22 Jan 2025 14:44:12 +0100 Subject: [PATCH 12/12] Revert "use memset instead of a for-loop" This reverts commit d71b3a592279455e733ef3b616ca98582d84bcbb. --- Python/codecs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 005801bcda3612..1b4ce255ad801f 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -739,7 +739,9 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); - memset(outp, Py_UNICODE_REPLACEMENT_CHARACTER, sizeof(Py_UCS2) * slen); + for (Py_ssize_t i = 0; i < slen; ++i) { + outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; + } assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); }