diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index cbad169fe4d56..8a98dcc38e786 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -875,7 +875,7 @@ I/O - Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`) - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`) - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`) -- +- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`) Period ^^^^^^ diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index d4ec21f38cdad..a6f18e0aec4d9 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -332,9 +332,18 @@ static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), return PyBytes_AS_STRING(obj); } -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), +static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { - return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); + char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, + (Py_ssize_t *)_outLen); + if (encoded == NULL) { + /* UTF-8 encoding failed. + Set errorMsg (to tell the encoder to stop) + and let the Python exception propagate. */ + JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; + enc->errorMsg = "Encoding failed."; + } + return encoded; } /* JSON callback. 
returns a char* and mutates the pointer to *len */ diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 3c841d829efd7..a5d7a16f77a72 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -291,6 +291,15 @@ def test_encode_unicode_4bytes_utf8highest(self): assert enc == json.dumps(four_bytes_input) assert dec == json.loads(enc) + def test_encode_unicode_error(self): + string = "'\udac0'" + msg = ( + r"'utf-8' codec can't encode character '\\udac0' " + r"in position 1: surrogates not allowed" + ) + with pytest.raises(UnicodeEncodeError, match=msg): + ujson.dumps([string]) + def test_encode_array_in_array(self): arr_in_arr_input = [[[[]]]] output = ujson.encode(arr_in_arr_input)