Skip to content

Commit 7d27588

Browse files
authored
REF: reindex_indexer use np.void to avoid JoinUnit (#43376)
1 parent 45982d8 commit 7d27588

File tree

2 files changed

+80
-8
lines changed

2 files changed

+80
-8
lines changed

pandas/core/internals/concat.py

+52-5
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ def concatenate_managers(
198198
if isinstance(mgrs_indexers[0][0], ArrayManager):
199199
return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
200200

201+
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
202+
201203
concat_plans = [
202204
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
203205
]
@@ -245,6 +247,38 @@ def concatenate_managers(
245247
return BlockManager(tuple(blocks), axes)
246248

247249

250+
def _maybe_reindex_columns_na_proxy(
251+
axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
252+
) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
253+
"""
254+
Reindex along columns so that all of the BlockManagers being concatenated
255+
have matching columns.
256+
257+
Columns added in this reindexing have dtype=np.void, indicating they
258+
should be ignored when choosing a column's final dtype.
259+
"""
260+
new_mgrs_indexers = []
261+
for mgr, indexers in mgrs_indexers:
262+
# We only reindex for axis=0 (i.e. columns), as this can be done cheaply
263+
if 0 in indexers:
264+
new_mgr = mgr.reindex_indexer(
265+
axes[0],
266+
indexers[0],
267+
axis=0,
268+
copy=False,
269+
only_slice=True,
270+
allow_dups=True,
271+
use_na_proxy=True,
272+
)
273+
new_indexers = indexers.copy()
274+
del new_indexers[0]
275+
new_mgrs_indexers.append((new_mgr, new_indexers))
276+
else:
277+
new_mgrs_indexers.append((mgr, indexers))
278+
279+
return new_mgrs_indexers
280+
281+
248282
def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]):
249283
"""
250284
Construct concatenation plan for given block manager and indexers.
@@ -375,6 +409,8 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
375409
return False
376410
if self.block is None:
377411
return True
412+
if self.block.dtype.kind == "V":
413+
return True
378414

379415
if self.dtype == object:
380416
values = self.block.values
@@ -401,6 +437,8 @@ def is_na(self) -> bool:
401437
blk = self.block
402438
if blk is None:
403439
return True
440+
if blk.dtype.kind == "V":
441+
return True
404442

405443
if not blk._can_hold_na:
406444
return False
@@ -426,7 +464,7 @@ def is_na(self) -> bool:
426464
return all(isna_all(row) for row in values)
427465

428466
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
429-
if upcasted_na is None:
467+
if upcasted_na is None and self.block.dtype.kind != "V":
430468
# No upcasting is necessary
431469
fill_value = self.block.fill_value
432470
values = self.block.get_values()
@@ -435,6 +473,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
435473

436474
if self._is_valid_na_for(empty_dtype):
437475
# note: always holds when self.block is None
476+
# or self.block.dtype.kind == "V"
438477
blk_dtype = getattr(self.block, "dtype", None)
439478

440479
if blk_dtype == np.dtype("object"):
@@ -512,7 +551,9 @@ def _concatenate_join_units(
512551

513552
empty_dtype = _get_empty_dtype(join_units)
514553

515-
has_none_blocks = any(unit.block is None for unit in join_units)
554+
has_none_blocks = any(
555+
unit.block is None or unit.block.dtype.kind == "V" for unit in join_units
556+
)
516557
upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
517558

518559
to_concat = [
@@ -597,13 +638,19 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
597638
empty_dtype = join_units[0].block.dtype
598639
return empty_dtype
599640

600-
has_none_blocks = any(unit.block is None for unit in join_units)
641+
has_none_blocks = any(
642+
unit.block is None or unit.block.dtype.kind == "V" for unit in join_units
643+
)
601644

602645
dtypes = [
603646
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
604647
]
605648
if not len(dtypes):
606-
dtypes = [unit.dtype for unit in join_units if unit.block is not None]
649+
dtypes = [
650+
unit.dtype
651+
for unit in join_units
652+
if unit.block is not None and unit.block.dtype.kind != "V"
653+
]
607654

608655
dtype = find_common_type(dtypes)
609656
if has_none_blocks:
@@ -619,7 +666,7 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
619666
620667
"""
621668
first = join_units[0].block
622-
if first is None:
669+
if first is None or first.dtype.kind == "V":
623670
return False
624671
return (
625672
# exclude cases where a) ju.block is None or b) we have e.g. Int64+int64

pandas/core/internals/managers.py

+28-3
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
from pandas.core.internals.blocks import (
6767
Block,
6868
DatetimeTZBlock,
69+
NumpyBlock,
6970
ensure_block_shape,
7071
extend_blocks,
7172
get_block_type,
@@ -613,6 +614,8 @@ def reindex_indexer(
613614
copy: bool = True,
614615
consolidate: bool = True,
615616
only_slice: bool = False,
617+
*,
618+
use_na_proxy: bool = False,
616619
) -> T:
617620
"""
618621
Parameters
@@ -627,6 +630,8 @@ def reindex_indexer(
627630
Whether to consolidate inplace before reindexing.
628631
only_slice : bool, default False
629632
Whether to take views, not copies, along columns.
633+
use_na_proxy : bool, default False
634+
Whether to use a np.void ndarray for newly introduced columns.
630635
631636
pandas-indexer with -1's only.
632637
"""
@@ -651,7 +656,10 @@ def reindex_indexer(
651656

652657
if axis == 0:
653658
new_blocks = self._slice_take_blocks_ax0(
654-
indexer, fill_value=fill_value, only_slice=only_slice
659+
indexer,
660+
fill_value=fill_value,
661+
only_slice=only_slice,
662+
use_na_proxy=use_na_proxy,
655663
)
656664
else:
657665
new_blocks = [
@@ -675,6 +683,8 @@ def _slice_take_blocks_ax0(
675683
slice_or_indexer: slice | np.ndarray,
676684
fill_value=lib.no_default,
677685
only_slice: bool = False,
686+
*,
687+
use_na_proxy: bool = False,
678688
) -> list[Block]:
679689
"""
680690
Slice/take blocks along axis=0.
@@ -688,6 +698,8 @@ def _slice_take_blocks_ax0(
688698
only_slice : bool, default False
689699
If True, we always return views on existing arrays, never copies.
690700
This is used when called from ops.blockwise.operate_blockwise.
701+
use_na_proxy : bool, default False
702+
Whether to use a np.void ndarray for newly introduced columns.
691703
692704
Returns
693705
-------
@@ -756,7 +768,11 @@ def _slice_take_blocks_ax0(
756768
# If we've got here, fill_value was not lib.no_default
757769

758770
blocks.append(
759-
self._make_na_block(placement=mgr_locs, fill_value=fill_value)
771+
self._make_na_block(
772+
placement=mgr_locs,
773+
fill_value=fill_value,
774+
use_na_proxy=use_na_proxy,
775+
)
760776
)
761777
else:
762778
blk = self.blocks[blkno]
@@ -798,7 +814,16 @@ def _slice_take_blocks_ax0(
798814

799815
return blocks
800816

801-
def _make_na_block(self, placement: BlockPlacement, fill_value=None) -> Block:
817+
def _make_na_block(
818+
self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
819+
) -> Block:
820+
821+
if use_na_proxy:
822+
assert fill_value is None
823+
shape = (len(placement), self.shape[1])
824+
vals = np.empty(shape, dtype=np.void)
825+
nb = NumpyBlock(vals, placement, ndim=2)
826+
return nb
802827

803828
if fill_value is None:
804829
fill_value = np.nan

0 commit comments

Comments (0)