From df8d226b2011cc40cd4aea6c489f6936663e1a0d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Apr 2021 14:56:02 +0200 Subject: [PATCH 1/7] REF: remove Block access in the JSON writing code --- pandas/_libs/src/ujson/python/objToJSON.c | 111 +++++------------- pandas/core/internals/array_manager.py | 4 + pandas/core/internals/blocks.py | 8 -- pandas/core/internals/managers.py | 4 + .../tests/io/json/test_json_table_schema.py | 4 - pandas/tests/io/json/test_pandas.py | 2 - 6 files changed, 37 insertions(+), 96 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index bbcee479aeb5a..a9f866ea74df0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -294,7 +294,12 @@ static int is_simple_frame(PyObject *obj) { if (!mgr) { return 0; } - int ret = (get_attr_length(mgr, "blocks") <= 1); + int ret; + if (PyObject_HasAttrString(mgr, "blocks")) { + ret = (get_attr_length(mgr, "blocks") <= 1); + } else { + ret = 0; + } Py_DECREF(mgr); return ret; @@ -656,16 +661,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *blocks, *block, *values, *tmp; - PyArrayObject *locs; + PyObject *obj, *values, *arrays, *array; PdBlockContext *blkCtxt; NpyArrContext *npyarr; Py_ssize_t i; - NpyIter *iter; - NpyIter_IterNextFunc *iternext; - npy_int64 **dataptr; - npy_int64 colIdx; - npy_intp idx; obj = (PyObject *)_obj; @@ -708,97 +707,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - blocks = get_sub_attr(obj, "_mgr", "blocks"); - if (!blocks) { + arrays = get_sub_attr(obj, "_mgr", "column_arrays"); + if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; - } else if (!PyTuple_Check(blocks)) { - PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!"); - goto BLKRET; } - // force transpose so each NpyArrContext strides down its column - GET_TC(tc)->transpose = 1; - - for (i = 0; i < PyObject_Length(blocks); i++) { - block = PyTuple_GET_ITEM(blocks, i); - if (!block) { + for (i = 0; i < PyObject_Length(arrays); i++) { + array = PyList_GET_ITEM(arrays, i); + if (!array) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + goto ARR_RET; } - tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL); - if (!tmp) { + // ensure we have a numpy array (i.e. np.asarray) + values = PyObject_CallMethod(array, "__array__", NULL); + if ((!values) || (!PyArray_CheckExact(values))) { + // Didn't get a numpy array ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - values = PyArray_Transpose((PyArrayObject *)tmp, NULL); - Py_DECREF(tmp); - if (!values) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array"); - if (!locs) { - Py_DECREF(values); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + goto ARR_RET; } - iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, - NPY_NO_CASTING, NULL); - if (!iter) { - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - NpyIter_Deallocate(iter); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter); - do { - colIdx = **dataptr; - idx = NpyIter_GetIterIndex(iter); + GET_TC(tc)->newObj = values; - blkCtxt->cindices[colIdx] = idx; + // init a dedicated context for this column + NpyArr_iterBegin(obj, tc); + npyarr = GET_TC(tc)->npyarr; - // Reference freed in Pdblock_iterend - Py_INCREF(values); - GET_TC(tc)->newObj = values; - - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; - - // set the dataptr to our desired column and initialise - if (npyarr != NULL) { - npyarr->dataptr += npyarr->stride * idx; - NpyArr_iterNext(obj, tc); - } - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - - blkCtxt->npyCtxts[colIdx] = npyarr; - GET_TC(tc)->newObj = NULL; - } while (iternext(iter)); + GET_TC(tc)->itemValue = NULL; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - NpyIter_Deallocate(iter); - Py_DECREF(values); - Py_DECREF(locs); + blkCtxt->npyCtxts[i] = npyarr; + GET_TC(tc)->newObj = NULL; } GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; + goto ARR_RET; -BLKRET: - Py_DECREF(blocks); +ARR_RET: + Py_DECREF(arrays); } void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 8c9902d330eee..8606db70f8f0a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -919,6 +919,10 @@ def iget_values(self, i: int) -> ArrayLike: """ return self.arrays[i] + @property + def column_arrays(self) -> list[ArrayLike]: + return self.arrays + def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): """ Set new column(s). diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 603cc6a6ff1f2..c945694396ec0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -224,14 +224,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: # expected "ndarray") return self.values # type: ignore[return-value] - @final - def get_block_values_for_json(self) -> np.ndarray: - """ - This is used in the JSON C code. - """ - # TODO(EA2D): reshape will be unnecessary with 2D EAs - return np.asarray(self.values).reshape(self.shape) - @final @cache_readonly def fill_value(self): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 97d605e2fa2d1..98a256ce592ea 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1137,6 +1137,10 @@ def iget_values(self, i: int) -> ArrayLike: values = block.iget(self.blklocs[i]) return values + @property + def column_arrays(self) -> list[ArrayLike]: + return [self.iget_values(i) for i in range(len(self.items))] + def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): """ Set new item in-place. Does not consolidate. Adds new Block if not diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 9d955545aede3..71f1d03ea6d1f 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,8 +6,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -26,8 +24,6 @@ set_default_names, ) -pytestmark = td.skip_array_manager_not_yet_implemented - class TestBuildSchema: def setup_method(self, method): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 97d44aafef74b..dc94354728ef6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -857,8 +857,6 @@ def test_convert_dates_infer(self, infer_word): result = read_json(dumps(data))[["id", infer_word]] tm.assert_frame_equal(result, expected) - # TODO(ArrayManager) JSON - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "date,date_unit", [ From e0f32f28f0cb0eec53ea4139138b78c6c2f181ea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Apr 2021 18:16:45 +0200 Subject: [PATCH 2/7] optimize BlockManager.column_arrays --- pandas/core/internals/managers.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 98a256ce592ea..d71dadaa7dacb 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1139,7 +1139,16 @@ def iget_values(self, i: int) -> ArrayLike: @property def column_arrays(self) -> list[ArrayLike]: - return [self.iget_values(i) for i in range(len(self.items))] + arrays = [np.asarray(arr) for arr in self.arrays] + result = [] + for i in range(len(self.items)): + arr = arrays[self.blknos[i]] + if arr.ndim == 2: + values = arr[self.blklocs[i]] + else: + values = arr + result.append(values) + return result def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): """ From ce2f9d0f2443b78ed965176a0f6ff66269119184 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Apr 2021 21:20:50 +0200 Subject: [PATCH 3/7] clean: remove cindices --- pandas/_libs/src/ujson/python/objToJSON.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index a9f866ea74df0..31b43cdb28d9d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -83,7 +83,6 @@ typedef struct __PdBlockContext { int ncols; int transpose; - int *cindices; // frame column -> block column map NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; @@ -686,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { if (blkCtxt->ncols == 0) { blkCtxt->npyCtxts = NULL; - blkCtxt->cindices = NULL; GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; @@ -700,13 +698,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols); - if (!blkCtxt->cindices) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - arrays = get_sub_attr(obj, "_mgr", "column_arrays"); if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; @@ -777,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { if (blkCtxt->npyCtxts) { PyObject_Free(blkCtxt->npyCtxts); } - if (blkCtxt->cindices) { - PyObject_Free(blkCtxt->cindices); - } PyObject_Free(blkCtxt); } } From 14fa6d3b54047700ff48043426ebfdc92cd95073 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Apr 2021 21:25:02 +0200 Subject: [PATCH 4/7] add docstring to column_arrays --- pandas/core/internals/array_manager.py | 3 +++ pandas/core/internals/managers.py | 9 +++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 8606db70f8f0a..a25750e7e1eab 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -921,6 +921,9 @@ def iget_values(self, i: int) -> ArrayLike: @property def column_arrays(self) -> list[ArrayLike]: + """ + Used in the JSON C code to access column arrays. + """ return self.arrays def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d71dadaa7dacb..bb757a5b637a1 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1138,8 +1138,13 @@ def iget_values(self, i: int) -> ArrayLike: return values @property - def column_arrays(self) -> list[ArrayLike]: - arrays = [np.asarray(arr) for arr in self.arrays] + def column_arrays(self) -> list[np.ndarray]: + """ + Used in the JSON C code to access column arrays. + This optimizes compared to using `iget_values` by converting each + block.values to a np.ndarray only once up front + """ + arrays = [np.asarray(blk.values) for blk in self.blocks] result = [] for i in range(len(self.items)): arr = arrays[self.blknos[i]] From 03ade01797aa4bd1d432d1f97586524ab1971896 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 09:25:04 +0200 Subject: [PATCH 5/7] add check for datetimetz --- pandas/core/internals/managers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b06280a9e8a0a..8f0de56043327 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1160,7 +1160,10 @@ def column_arrays(self) -> list[np.ndarray]: This optimizes compared to using `iget_values` by converting each block.values to a np.ndarray only once up front """ - arrays = [np.asarray(blk.values) for blk in self.blocks] + arrays = [ + blk._ndarray if blk.is_datetimetz else np.asarray(blk.values) + for blk in self.blocks + ] result = [] for i in range(len(self.items)): arr = arrays[self.blknos[i]] From aad5c32e48c5cd1f781e0c349ee4112750d30875 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 09:27:26 +0200 Subject: [PATCH 6/7] add comment about special case --- pandas/core/internals/managers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8f0de56043327..a9ce4216b12d5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1160,8 +1160,9 @@ def column_arrays(self) -> list[np.ndarray]: This optimizes compared to using `iget_values` by converting each block.values to a np.ndarray only once up front """ + # special casing datetimetz to avoid conversion through object dtype arrays = [ - blk._ndarray if blk.is_datetimetz else np.asarray(blk.values) + blk._ndarray if isinstance(blk, DatetimeTZBlock) else np.asarray(blk.values) for blk in self.blocks ] result = [] From b19fc1ffb2f614de0c931b0de2d69c7786b2f781 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 09:36:57 +0200 Subject: [PATCH 7/7] fixup --- pandas/core/internals/managers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a9ce4216b12d5..487047f1a1dbb 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1162,7 +1162,9 @@ def column_arrays(self) -> list[np.ndarray]: """ # special casing datetimetz to avoid conversion through object dtype arrays = [ - blk._ndarray if isinstance(blk, DatetimeTZBlock) else np.asarray(blk.values) + blk.values._ndarray + if isinstance(blk, DatetimeTZBlock) + else np.asarray(blk.values) for blk in self.blocks ] result = []