From 5120b8bd7354bb9ffb2a61f7d5c7f23082cc7b8e Mon Sep 17 00:00:00 2001 From: Evgeny Turnaev Date: Tue, 29 Mar 2016 16:39:51 -0700 Subject: [PATCH 1/3] ujson __json__ attribute logic --- pandas/src/ujson/lib/ultrajson.h | 1 + pandas/src/ujson/lib/ultrajsonenc.c | 24 ++++++++++++ pandas/src/ujson/python/objToJSON.c | 59 +++++++++++++++++++++++++---- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h index f83f74a0fe0da..dc43313266cf2 100644 --- a/pandas/src/ujson/lib/ultrajson.h +++ b/pandas/src/ujson/lib/ultrajson.h @@ -152,6 +152,7 @@ enum JSTYPES JT_LONG, //(JSINT64 (signed 64-bit)) JT_DOUBLE, //(double) JT_UTF8, //(char 8-bit) + JT_RAW, //(raw char 8-bit) __json__ attribute JT_ARRAY, // Array structure JT_OBJECT, // Key/Value structure JT_INVALID, // Internal, do not return nor expect diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c index 5e2a226ae8d63..b660d25aad71a 100644 --- a/pandas/src/ujson/lib/ultrajsonenc.c +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -837,6 +837,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) break; } + case JT_UTF8: { value = enc->getStringValue(obj, &tc, &szlen); @@ -870,6 +871,29 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) Buffer_AppendCharUnchecked (enc, '\"'); break; } + + case JT_RAW: + { + value = enc->getStringValue(obj, &tc, &szlen); + if(!value) + { + SetError(obj, enc, "utf-8 encoding error"); + return; + } + + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + if (enc->errorMsg) + { + enc->endTypeContext(obj, &tc); + return; + } + + memcpy(enc->offset, value, szlen); + enc->offset += szlen; + + break; + } + } enc->endTypeContext(obj, &tc); diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index dcb509be696dc..0187d72896d4d 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -111,6 +111,7 @@ typedef struct __TypeContext double doubleValue; JSINT64 longValue; + PyObject *rawJSONValue; char *cStr; NpyArrContext *npyarr; @@ -219,6 +220,7 @@ static TypeContext* createTypeContext(void) pc->index = 0; pc->size = 0; pc->longValue = 0; + pc->rawJSONValue = 0; pc->doubleValue = 0.0; pc->cStr = NULL; pc->npyarr = NULL; @@ -364,6 +366,17 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, si return PyString_AS_STRING(newObj); } +static void *PyRawJSONToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = GET_TC(tc)->rawJSONValue; + if (PyUnicode_Check(obj)) { + return PyUnicodeToUTF8(obj, tc, outValue, _outLen); + } + else { + return PyStringToUTF8(obj, tc, outValue, _outLen); + } +} + static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen) { int base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; @@ -1914,7 +1927,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyString_Check(obj)) + if (PyString_Check(obj) && !PyObject_HasAttrString(obj, "__json__")) { PRINTMARK(); pc->PyTypeToJSON = PyStringToUTF8; tc->type = JT_UTF8; @@ -2359,10 +2372,9 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } - toDictFunc = PyObject_GetAttrString(obj, "toDict"); - - if (toDictFunc) + if (PyObject_HasAttrString(obj, "toDict")) { + toDictFunc = PyObject_GetAttrString(obj, "toDict"); PyObject* tuple = PyTuple_New(0); PyObject* toDictResult = PyObject_Call(toDictFunc, tuple, NULL); Py_DECREF(tuple); @@ -2377,9 +2389,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) if (!PyDict_Check(toDictResult)) { - Py_DECREF(toDictResult); - tc->type = JT_NULL; - return; + goto INVALID; } PRINTMARK(); @@ -2392,6 +2402,41 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) pc->dictObj = toDictResult; return; } + else + if (PyObject_HasAttrString(obj, "__json__")) + { + PyObject* toJSONFunc = PyObject_GetAttrString(obj, "__json__"); + PyObject* tuple = PyTuple_New(0); + PyErr_Clear(); + PyObject* toJSONResult = PyObject_Call(toJSONFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toJSONFunc); + + if (toJSONResult == NULL) + { + goto INVALID; + } + + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(toJSONResult); + goto INVALID; + } + + if (!PyString_Check(toJSONResult) && !PyUnicode_Check(toJSONResult)) + { + Py_DECREF(toJSONResult); + PyErr_Format (PyExc_TypeError, "expected string"); + goto INVALID; + } + + PRINTMARK(); + pc->PyTypeToJSON = PyRawJSONToUTF8; + tc->type = JT_RAW; + GET_TC(tc)->rawJSONValue = toJSONResult; + return; + } PyErr_Clear(); From c7c949ae99d880f03088e80c5b2a030e204a2cea Mon Sep 17 00:00:00 2001 From: Evgeny Turnaev Date: Tue, 29 Mar 2016 22:59:35 -0700 Subject: [PATCH 2/3] __json__ attribute tests --- pandas/io/tests/test_json/test_ujson.py | 110 ++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py index f5efb54099ddd..5be0c1a0f96bd 100644 --- a/pandas/io/tests/test_json/test_ujson.py +++ b/pandas/io/tests/test_json/test_ujson.py @@ -38,6 +38,7 @@ def _skip_if_python_ver(skip_major, skip_minor=None): else partial(json.dumps, encoding="utf-8")) + class UltraJSONTests(TestCase): def test_encodeDecimal(self): @@ -845,6 +846,18 @@ def test_decodeBigEscape(self): input = quote + (base * 1024 * 1024 * 2) + quote output = ujson.decode(input) # noqa + + def test_object_default(self): + # An object without toDict or __json__ defined should be serialized + # as an empty dict. + class ObjectTest: + pass + + output = ujson.encode(ObjectTest()) + dec = ujson.decode(output) + self.assertEquals(dec, {}) + + def test_toDict(self): d = {u("key"): 31337} @@ -852,12 +865,72 @@ class DictTest: def toDict(self): return d + def __json__(self): + return '"json defined"' # Fallback and shouldn't be called. o = DictTest() output = ujson.encode(o) dec = ujson.decode(output) self.assertEqual(dec, d) + def test_object_with_json(self): + # If __json__ returns a string, then that string + # will be used as a raw JSON snippet in the object. + output_text = 'this is the correct output' + class JSONTest: + def __json__(self): + return '"' + output_text + '"' + + d = {u'key': JSONTest()} + output = ujson.encode(d) + dec = ujson.decode(output) + self.assertEquals(dec, {u'key': output_text}) + + def test_object_with_json_unicode(self): + # If __json__ returns a string, then that string + # will be used as a raw JSON snippet in the object. + output_text = u'this is the correct output' + class JSONTest: + def __json__(self): + return u'"' + output_text + u'"' + + d = {u'key': JSONTest()} + output = ujson.encode(d) + dec = ujson.decode(output) + self.assertEquals(dec, {u'key': output_text}) + + def test_object_with_complex_json(self): + # If __json__ returns a string, then that string + # will be used as a raw JSON snippet in the object. + obj = {u'foo': [u'bar', u'baz']} + class JSONTest: + def __json__(self): + return ujson.encode(obj) + + d = {u'key': JSONTest()} + output = ujson.encode(d) + dec = ujson.decode(output) + self.assertEquals(dec, {u'key': obj}) + + def test_object_with_json_type_error(self): + # __json__ must return a string, otherwise it should raise an error. + for return_value in (None, 1234, 12.34, True, {}): + class JSONTest: + def __json__(self): + return return_value + + d = {u'key': JSONTest()} + self.assertRaises(TypeError, ujson.encode, d) + + def test_object_with_json_attribute_error(self): + # If __json__ raises an error, make sure python actually raises it. + class JSONTest: + def __json__(self): + raise AttributeError + + d = {u'key': JSONTest()} + self.assertRaises(AttributeError, ujson.encode, d) + def test_defaultHandler(self): class _TestObject(object): @@ -1182,6 +1255,7 @@ def testArrayNumpyLabelled(self): class PandasJSONTests(TestCase): + def testDataFrame(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1589,6 +1663,42 @@ def test_encodeSet(self): self.assertTrue(v in s) + def test_rawJsonInDataFrame(self): + class ujson_as_is(object): + def __init__(self, value): + self.value = value + def __json__(self): + return self.value + def __eq__(self, other): + return ujson.loads(self.value) == ujson.loads(other.value) + __repr__ = __json__ + + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], index=[ + 'a', 'b'], columns=['w', 'x', 'y', 'z']) + + x_y_ser = df[['x','y']].apply( + lambda x: ujson_as_is(ujson.dumps(x.to_dict())), + axis = 1) + + expected_result = { + 'a': ujson_as_is('{"y":3,"x":2}'), + 'b': ujson_as_is('{"y":7,"x":6}') + } + self.assertEqual(x_y_ser.to_dict(), expected_result) + + df['x_y'] = x_y_ser + ser_x_y_z = df[['x_y', 'z']].apply(lambda x: ujson_as_is(ujson.dumps(x.to_dict())), axis = 1) + df['x_y_z'] = ser_x_y_z + + df_json_dump = df[['x_y_z', 'w']].to_json(orient='records') + + expected_result = '[{"x_y_z":{"z":4,"x_y":{"y":3,"x":2}},"w":1},{"x_y_z":{"z":8,"x_y":{"y":7,"x":6}},"w":5}]' + + self.assertEqual(ujson.loads(df_json_dump), ujson.loads(expected_result)) + + + + def _clean_dict(d): return dict((str(k), v) for k, v in compat.iteritems(d)) From e5e4a04c4b776e86ab390689459550b23eed5d04 Mon Sep 17 00:00:00 2001 From: Evgeny Turnaev Date: Wed, 30 Mar 2016 13:05:05 -0700 Subject: [PATCH 3/3] flake8 thing --- pandas/io/tests/test_json/test_ujson.py | 44 ++++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py index 5be0c1a0f96bd..72708b932265b 100644 --- a/pandas/io/tests/test_json/test_ujson.py +++ b/pandas/io/tests/test_json/test_ujson.py @@ -38,7 +38,6 @@ def _skip_if_python_ver(skip_major, skip_minor=None): else partial(json.dumps, encoding="utf-8")) - class UltraJSONTests(TestCase): def test_encodeDecimal(self): @@ -846,7 +845,6 @@ def test_decodeBigEscape(self): input = quote + (base * 1024 * 1024 * 2) + quote output = ujson.decode(input) # noqa - def test_object_default(self): # An object without toDict or __json__ defined should be serialized # as an empty dict. @@ -857,7 +855,6 @@ class ObjectTest: dec = ujson.decode(output) self.assertEquals(dec, {}) - def test_toDict(self): d = {u("key"): 31337} @@ -865,8 +862,9 @@ class DictTest: def toDict(self): return d + def __json__(self): - return '"json defined"' # Fallback and shouldn't be called. + return '"json defined"' # Fallback and shouldn't be called. o = DictTest() output = ujson.encode(o) @@ -877,7 +875,9 @@ def test_object_with_json(self): # If __json__ returns a string, then that string # will be used as a raw JSON snippet in the object. output_text = 'this is the correct output' + class JSONTest: + def __json__(self): return '"' + output_text + '"' @@ -890,7 +890,9 @@ def test_object_with_json_unicode(self): # If __json__ returns a string, then that string # will be used as a raw JSON snippet in the object. output_text = u'this is the correct output' + class JSONTest: + def __json__(self): return u'"' + output_text + u'"' @@ -903,7 +905,9 @@ def test_object_with_complex_json(self): # If __json__ returns a string, then that string # will be used as a raw JSON snippet in the object. obj = {u'foo': [u'bar', u'baz']} + class JSONTest: + def __json__(self): return ujson.encode(obj) @@ -1255,7 +1259,6 @@ def testArrayNumpyLabelled(self): class PandasJSONTests(TestCase): - def testDataFrame(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1662,23 +1665,29 @@ def test_encodeSet(self): for v in dec: self.assertTrue(v in s) - def test_rawJsonInDataFrame(self): + class ujson_as_is(object): + def __init__(self, value): self.value = value + def __json__(self): return self.value + def __eq__(self, other): return ujson.loads(self.value) == ujson.loads(other.value) + __repr__ = __json__ - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], index=[ - 'a', 'b'], columns=['w', 'x', 'y', 'z']) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + index=['a', 'b'], + columns=['w', 'x', 'y', 'z']) - x_y_ser = df[['x','y']].apply( - lambda x: ujson_as_is(ujson.dumps(x.to_dict())), - axis = 1) + x_y_ser = df[['x', 'y']].apply( + lambda x: ujson_as_is(ujson.dumps(x.to_dict())), + axis=1 + ) expected_result = { 'a': ujson_as_is('{"y":3,"x":2}'), @@ -1687,16 +1696,19 @@ def __eq__(self, other): self.assertEqual(x_y_ser.to_dict(), expected_result) df['x_y'] = x_y_ser - ser_x_y_z = df[['x_y', 'z']].apply(lambda x: ujson_as_is(ujson.dumps(x.to_dict())), axis = 1) + ser_x_y_z = df[['x_y', 'z']].apply( + lambda x: ujson_as_is(ujson.dumps(x.to_dict())), + axis=1 + ) df['x_y_z'] = ser_x_y_z df_json_dump = df[['x_y_z', 'w']].to_json(orient='records') - expected_result = '[{"x_y_z":{"z":4,"x_y":{"y":3,"x":2}},"w":1},{"x_y_z":{"z":8,"x_y":{"y":7,"x":6}},"w":5}]' - - self.assertEqual(ujson.loads(df_json_dump), ujson.loads(expected_result)) - + expected_result = '[{"x_y_z":{"z":4,"x_y":{"y":3,"x":2}},"w":1}' + \ + ',{"x_y_z":{"z":8,"x_y":{"y":7,"x":6}},"w":5}]' + self.assertEqual(ujson.loads(df_json_dump), + ujson.loads(expected_result)) def _clean_dict(d):