From f9ae3cd270389f2937ea558cb5305fc9414f3649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 13:38:36 +0100 Subject: [PATCH 1/2] Use `_PyUnicodeError_GetParams` for the `backslashreplace` handler. We also refactor that handler and extract the logic for each exception being handled into separate functions. --- Python/codecs.c | 111 ++++++++++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 41 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 6c9f8222079ec8..a69cc107f69100 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -936,49 +936,18 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) return restuple; } -PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) + +// --- handler: 'backslashreplace' -------------------------------------------- + +static PyObject * +_PyCodec_BackslashReplaceUnicodeEncodeError(PyObject *exc) { PyObject *obj; Py_ssize_t objlen, start, end, slen; - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - if (_PyUnicodeError_GetParams(exc, - &obj, &objlen, - &start, &end, &slen, true) < 0) - { - return NULL; - } - PyObject *res = PyUnicode_New(4 * slen, 127); - if (res == NULL) { - Py_DECREF(obj); - return NULL; - } - Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); - const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); - for (Py_ssize_t i = start; i < end; i++, outp += 4) { - const unsigned char ch = p[i]; - outp[0] = '\\'; - outp[1] = 'x'; - outp[2] = Py_hexdigits[(ch >> 4) & 0xf]; - outp[3] = Py_hexdigits[ch & 0xf]; - } - assert(_PyUnicode_CheckConsistency(res, 1)); - Py_DECREF(obj); - return Py_BuildValue("(Nn)", res, end); - } - - if ( - PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError) - || PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError) - ) { - if (_PyUnicodeError_GetParams(exc, - &obj, &objlen, - &start, &end, &slen, false) 
< 0) - { - return NULL; - } - } - else { - wrong_exception_type(exc); + if (_PyUnicodeError_GetParams(exc, + &obj, &objlen, + &start, &end, &slen, false) < 0) + { return NULL; } @@ -1015,6 +984,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } +static PyObject * +_PyCodec_BackslashReplaceUnicodeDecodeError(PyObject *exc) +{ + PyObject *obj; + Py_ssize_t objlen, start, end, slen; + if (_PyUnicodeError_GetParams(exc, + &obj, &objlen, + &start, &end, &slen, true) < 0) + { + return NULL; + } + + PyObject *res = PyUnicode_New(4 * slen, 127); + if (res == NULL) { + Py_DECREF(obj); + return NULL; + } + + Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); + const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); + for (Py_ssize_t i = start; i < end; i++, outp += 4) { + const unsigned char ch = p[i]; + outp[0] = '\\'; + outp[1] = 'x'; + outp[2] = Py_hexdigits[(ch >> 4) & 0xf]; + outp[3] = Py_hexdigits[ch & 0xf]; + } + assert(_PyUnicode_CheckConsistency(res, 1)); + Py_DECREF(obj); + return Py_BuildValue("(Nn)", res, end); +} + + +static inline PyObject * +_PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc) +{ + // Same implementation as for UnicodeEncodeError objects. 
+ return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); +} + + +PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) +{ + if (_PyIsUnicodeDecodeError(exc)) { + return _PyCodec_BackslashReplaceUnicodeDecodeError(exc); + } + else if (_PyIsUnicodeDecodeError(exc)) { + return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); + } + else if (_PyIsUnicodeTranslateError(exc)) { + return _PyCodec_BackslashReplaceUnicodeTranslateError(exc); + } + else { + wrong_exception_type(exc); + return NULL; + } +} + + // --- handler: 'namereplace' ------------------------------------------------- PyObject *PyCodec_NameReplaceErrors(PyObject *exc) @@ -1425,7 +1453,8 @@ static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) } -static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) +static inline PyObject * +backslashreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) { return PyCodec_BackslashReplaceErrors(exc); } From 8965abb6a932a3f5af23b467f6bfe23e33f5d7ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 14:08:34 +0100 Subject: [PATCH 2/2] fix compilation --- Python/codecs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index a69cc107f69100..0b557e97c261f0 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1027,11 +1027,11 @@ _PyCodec_BackslashReplaceUnicodeTranslateError(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { - if (_PyIsUnicodeDecodeError(exc)) { - return _PyCodec_BackslashReplaceUnicodeDecodeError(exc); + if (_PyIsUnicodeEncodeError(exc)) { + return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); } else if (_PyIsUnicodeDecodeError(exc)) { - return _PyCodec_BackslashReplaceUnicodeEncodeError(exc); + return _PyCodec_BackslashReplaceUnicodeDecodeError(exc); } else if (_PyIsUnicodeTranslateError(exc)) { return 
_PyCodec_BackslashReplaceUnicodeTranslateError(exc);