Skip to content

gh-129173: refactor PyCodec_ReplaceErrors into separate functions #129893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 25, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 83 additions & 40 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,27 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
}


/*
 * Build a Unicode string made of 'count' repetitions of the official
 * Unicode REPLACEMENT CHARACTER (0xFFFD).
 *
 * Returns NULL when PyUnicode_New() fails to create the string.
 */
static PyObject *
codec_handler_unicode_replacement_character(Py_ssize_t count)
{
    PyObject *str = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
    if (str == NULL) {
        return NULL;
    }
    // The 2-byte kind is only required of a non-empty result.
    assert(count == 0 || PyUnicode_KIND(str) == PyUnicode_2BYTE_KIND);

    // Walk the 2-byte buffer and store the replacement character
    // into each of the 'count' slots.
    Py_UCS2 *cursor = PyUnicode_2BYTE_DATA(str);
    Py_ssize_t remaining = count;
    while (remaining > 0) {
        *cursor++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        --remaining;
    }

    assert(_PyUnicode_CheckConsistency(str, 1));
    return str;
}


// --- handler: 'strict' ------------------------------------------------------

PyObject *PyCodec_StrictErrors(PyObject *exc)
Expand Down Expand Up @@ -774,50 +795,71 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc)
}


PyObject *PyCodec_ReplaceErrors(PyObject *exc)
// --- handler: 'replace' -----------------------------------------------------

/*
 * 'replace' handling for an encode error: produce a string of 'slen'
 * question marks, where 'slen' and 'end' are the values reported by
 * _PyUnicodeError_GetParams() for 'exc'.
 *
 * Returns the result of Py_BuildValue("(Nn)", string, end), or NULL
 * if fetching the parameters or creating the string fails.
 */
static PyObject *
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
{
    Py_ssize_t start, end, slen;
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, &slen, false) < 0) {
        return NULL;
    }

    PyObject *marks = PyUnicode_New(slen, '?');
    if (marks == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(marks) == PyUnicode_1BYTE_KIND);

    // One byte per character, so the whole buffer can be filled at once.
    Py_UCS1 *buffer = PyUnicode_1BYTE_DATA(marks);
    memset(buffer, '?', sizeof(Py_UCS1) * slen);
    assert(_PyUnicode_CheckConsistency(marks, 1));

    // "N" hands the reference to 'marks' over to the built tuple.
    PyObject *result = Py_BuildValue("(Nn)", marks, end);
    return result;
}

if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
&start, &end, &slen, false) < 0) {
return NULL;
}
PyObject *res = PyUnicode_New(slen, '?');
if (res == NULL) {
return NULL;
}
assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
memset(outp, '?', sizeof(Py_UCS1) * slen);
assert(_PyUnicode_CheckConsistency(res, 1));
return Py_BuildValue("(Nn)", res, end);

static PyObject *
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
{
Py_ssize_t end;
if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
return NULL;
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
NULL, &end, NULL, true) < 0) {
return NULL;
}
return Py_BuildValue("(Cn)",
(int)Py_UNICODE_REPLACEMENT_CHARACTER,
end);
PyObject *res = codec_handler_unicode_replacement_character(1);
if (res == NULL) {
return NULL;
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
&start, &end, &slen, false) < 0) {
return NULL;
}
PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER);
if (res == NULL) {
return NULL;
}
assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
for (Py_ssize_t i = 0; i < slen; ++i) {
outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
}
assert(_PyUnicode_CheckConsistency(res, 1));
return Py_BuildValue("(Nn)", res, end);
return Py_BuildValue("(Nn)", res, end);
}


/*
 * 'replace' handling for a translate error: produce a string of 'slen'
 * Unicode replacement characters, where 'slen' and 'end' are the values
 * reported by _PyUnicodeError_GetParams() for 'exc'.
 *
 * Returns the result of Py_BuildValue("(Nn)", string, end), or NULL
 * if fetching the parameters or creating the string fails.
 */
static PyObject *
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
{
    Py_ssize_t start, end, slen;
    if (_PyUnicodeError_GetParams(exc, NULL, NULL, &start, &end, &slen, false) < 0) {
        return NULL;
    }

    PyObject *replacement = codec_handler_unicode_replacement_character(slen);
    if (replacement == NULL) {
        return NULL;
    }

    // "N" hands the reference to 'replacement' over to the built tuple.
    PyObject *result = Py_BuildValue("(Nn)", replacement, end);
    return result;
}


PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
if (_PyIsUnicodeEncodeError(exc)) {
return _PyCodec_ReplaceUnicodeEncodeError(exc);
}
else if (_PyIsUnicodeDecodeError(exc)) {
return _PyCodec_ReplaceUnicodeDecodeError(exc);
}
else if (_PyIsUnicodeTranslateError(exc)) {
return _PyCodec_ReplaceUnicodeTranslateError(exc);
}
else {
wrong_exception_type(exc);
Expand Down Expand Up @@ -1468,7 +1510,8 @@ ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
}


static PyObject *replace_errors(PyObject *self, PyObject *exc)
/* Thin wrapper: ignores 'self' and forwards 'exc' to PyCodec_ReplaceErrors(). */
static inline PyObject *
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
Expand Down
Loading