
Commit 7e6bf97

Merge remote-tracking branch 'upstream/master' into styler_format_index
2 parents afa6da3 + 7d27588

7 files changed (+135, -12 lines)

pandas/core/config_init.py (+1, -1)

@@ -896,7 +896,7 @@ def register_converter_cb(key):
         "latex.multicol_align",
         "r",
         styler_multicol_align,
-        validator=is_one_of_factory(["r", "c", "l"]),
+        validator=is_one_of_factory(["r", "c", "l", "naive-l", "naive-r"]),
     )

     cf.register_option(
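
Note: a minimal usage sketch of what the widened validator permits (assuming a pandas build that includes this change); the full option path ``styler.latex.multicol_align`` matches the default referenced in pandas/io/formats/style.py below.

    import pandas as pd

    # The two new "naive" values should now pass option validation.
    pd.set_option("styler.latex.multicol_align", "naive-l")
    print(pd.get_option("styler.latex.multicol_align"))  # naive-l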

pandas/core/frame.py (+12, -1)

@@ -469,7 +469,8 @@ class DataFrame(NDFrame, OpsMixin):
     ----------
     data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
         Dict can contain Series, arrays, constants, dataclass or list-like objects. If
-        data is a dict, column order follows insertion-order.
+        data is a dict, column order follows insertion-order. If a dict contains Series
+        which have an index defined, it is aligned by its index.

         .. versionchanged:: 0.25.0
            If data is a list of dicts, column order follows insertion-order.
@@ -524,6 +525,16 @@ class DataFrame(NDFrame, OpsMixin):
     col2    int8
     dtype: object

+    Constructing DataFrame from a dictionary including Series:
+
+    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
+    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
+       col1  col2
+    0     0   NaN
+    1     1   NaN
+    2     2   2.0
+    3     3   3.0
+
     Constructing DataFrame from numpy ndarray:

     >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),

pandas/core/internals/concat.py (+52, -5)

@@ -198,6 +198,8 @@ def concatenate_managers(
     if isinstance(mgrs_indexers[0][0], ArrayManager):
         return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)

+    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
+
     concat_plans = [
         _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
     ]
@@ -245,6 +247,38 @@ def concatenate_managers(
     return BlockManager(tuple(blocks), axes)


+def _maybe_reindex_columns_na_proxy(
+    axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
+) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
+    """
+    Reindex along columns so that all of the BlockManagers being concatenated
+    have matching columns.
+
+    Columns added in this reindexing have dtype=np.void, indicating they
+    should be ignored when choosing a column's final dtype.
+    """
+    new_mgrs_indexers = []
+    for mgr, indexers in mgrs_indexers:
+        # We only reindex for axis=0 (i.e. columns), as this can be done cheaply
+        if 0 in indexers:
+            new_mgr = mgr.reindex_indexer(
+                axes[0],
+                indexers[0],
+                axis=0,
+                copy=False,
+                only_slice=True,
+                allow_dups=True,
+                use_na_proxy=True,
+            )
+            new_indexers = indexers.copy()
+            del new_indexers[0]
+            new_mgrs_indexers.append((new_mgr, new_indexers))
+        else:
+            new_mgrs_indexers.append((mgr, indexers))
+
+    return new_mgrs_indexers
+
+
 def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]):
     """
     Construct concatenation plan for given block manager and indexers.
@@ -375,6 +409,8 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
             return False
         if self.block is None:
             return True
+        if self.block.dtype.kind == "V":
+            return True

         if self.dtype == object:
             values = self.block.values
@@ -401,6 +437,8 @@ def is_na(self) -> bool:
         blk = self.block
         if blk is None:
             return True
+        if blk.dtype.kind == "V":
+            return True

         if not blk._can_hold_na:
             return False
@@ -426,7 +464,7 @@ def is_na(self) -> bool:
         return all(isna_all(row) for row in values)

     def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
-        if upcasted_na is None:
+        if upcasted_na is None and self.block.dtype.kind != "V":
             # No upcasting is necessary
             fill_value = self.block.fill_value
             values = self.block.get_values()
@@ -435,6 +473,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:

         if self._is_valid_na_for(empty_dtype):
             # note: always holds when self.block is None
+            # or self.block.dtype.kind == "V"
             blk_dtype = getattr(self.block, "dtype", None)

             if blk_dtype == np.dtype("object"):
@@ -512,7 +551,9 @@ def _concatenate_join_units(

     empty_dtype = _get_empty_dtype(join_units)

-    has_none_blocks = any(unit.block is None for unit in join_units)
+    has_none_blocks = any(
+        unit.block is None or unit.block.dtype.kind == "V" for unit in join_units
+    )
     upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)

     to_concat = [
@@ -597,13 +638,19 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
         empty_dtype = join_units[0].block.dtype
         return empty_dtype

-    has_none_blocks = any(unit.block is None for unit in join_units)
+    has_none_blocks = any(
+        unit.block is None or unit.block.dtype.kind == "V" for unit in join_units
+    )

     dtypes = [
         unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
     ]
     if not len(dtypes):
-        dtypes = [unit.dtype for unit in join_units if unit.block is not None]
+        dtypes = [
+            unit.dtype
+            for unit in join_units
+            if unit.block is not None and unit.block.dtype.kind != "V"
+        ]

     dtype = find_common_type(dtypes)
     if has_none_blocks:
@@ -619,7 +666,7 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:

     """
     first = join_units[0].block
-    if first is None:
+    if first is None or first.dtype.kind == "V":
         return False
     return (
         # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
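
Note: a standalone sketch (not pandas internals) of the np.void "proxy" idea used above — a void-dtype array stands in for a column missing from one of the inputs, and ``dtype.kind == "V"`` lets the dtype-resolution step skip it.

    import numpy as np

    # Placeholder for an absent column vs. real values from a manager that has it.
    proxy = np.empty(3, dtype=np.void)
    real = np.array([1, 2, 3], dtype="int64")

    # Mirrors the filtering in _get_empty_dtype: void-kind blocks are ignored.
    dtypes = [arr.dtype for arr in (proxy, real) if arr.dtype.kind != "V"]
    print(dtypes)  # [dtype('int64')] -- the proxy does not force an upcast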

pandas/core/internals/managers.py (+28, -3)

@@ -66,6 +66,7 @@
 from pandas.core.internals.blocks import (
     Block,
     DatetimeTZBlock,
+    NumpyBlock,
     ensure_block_shape,
     extend_blocks,
     get_block_type,
@@ -613,6 +614,8 @@ def reindex_indexer(
         copy: bool = True,
         consolidate: bool = True,
         only_slice: bool = False,
+        *,
+        use_na_proxy: bool = False,
     ) -> T:
         """
         Parameters
@@ -627,6 +630,8 @@ def reindex_indexer(
             Whether to consolidate inplace before reindexing.
         only_slice : bool, default False
             Whether to take views, not copies, along columns.
+        use_na_proxy : bool, default False
+            Whether to use a np.void ndarray for newly introduced columns.

         pandas-indexer with -1's only.
         """
@@ -651,7 +656,10 @@ def reindex_indexer(

         if axis == 0:
             new_blocks = self._slice_take_blocks_ax0(
-                indexer, fill_value=fill_value, only_slice=only_slice
+                indexer,
+                fill_value=fill_value,
+                only_slice=only_slice,
+                use_na_proxy=use_na_proxy,
             )
         else:
             new_blocks = [
@@ -675,6 +683,8 @@ def _slice_take_blocks_ax0(
         slice_or_indexer: slice | np.ndarray,
         fill_value=lib.no_default,
         only_slice: bool = False,
+        *,
+        use_na_proxy: bool = False,
     ) -> list[Block]:
         """
         Slice/take blocks along axis=0.
@@ -688,6 +698,8 @@ def _slice_take_blocks_ax0(
         only_slice : bool, default False
             If True, we always return views on existing arrays, never copies.
             This is used when called from ops.blockwise.operate_blockwise.
+        use_na_proxy : bool, default False
+            Whether to use a np.void ndarray for newly introduced columns.

         Returns
         -------
@@ -756,7 +768,11 @@ def _slice_take_blocks_ax0(
                 # If we've got here, fill_value was not lib.no_default

                 blocks.append(
-                    self._make_na_block(placement=mgr_locs, fill_value=fill_value)
+                    self._make_na_block(
+                        placement=mgr_locs,
+                        fill_value=fill_value,
+                        use_na_proxy=use_na_proxy,
+                    )
                 )
             else:
                 blk = self.blocks[blkno]
@@ -798,7 +814,16 @@ def _slice_take_blocks_ax0(

         return blocks

-    def _make_na_block(self, placement: BlockPlacement, fill_value=None) -> Block:
+    def _make_na_block(
+        self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
+    ) -> Block:
+
+        if use_na_proxy:
+            assert fill_value is None
+            shape = (len(placement), self.shape[1])
+            vals = np.empty(shape, dtype=np.void)
+            nb = NumpyBlock(vals, placement, ndim=2)
+            return nb

         if fill_value is None:
             fill_value = np.nan
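
Note: a standalone sketch of the ``use_na_proxy`` branch of ``_make_na_block`` above (illustrative names, not the pandas Block API) — a void-dtype placeholder is allocated for the newly introduced columns instead of materialising a NaN-filled float block.

    import numpy as np

    def make_na_proxy_values(n_columns: int, n_rows: int) -> np.ndarray:
        # Zero-itemsize void values: carry shape/placement info but no data.
        return np.empty((n_columns, n_rows), dtype=np.void)

    vals = make_na_proxy_values(2, 4)
    print(vals.shape, vals.dtype.kind)  # (2, 4) V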

pandas/io/formats/style.py (+5, -2)

@@ -527,10 +527,13 @@ def to_latex(
             without multirow.

             .. versionchanged:: 1.4.0
-        multicol_align : {"r", "c", "l"}, optional
+        multicol_align : {"r", "c", "l", "naive-l", "naive-r"}, optional
             If sparsifying hierarchical MultiIndex columns whether to align text at
             the left, centrally, or at the right. If not given defaults to
-            ``pandas.options.styler.latex.multicol_align``
+            ``pandas.options.styler.latex.multicol_align``. If a naive option is
+            given renders without multicol.
+
+            .. versionchanged:: 1.4.0
         siunitx : bool, default False
             Set to ``True`` to structure LaTeX compatible with the {siunitx} package.
         environment : str, optional
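
Note: a hedged usage sketch of the documented parameter — rendering a spanned MultiIndex column header without ``\multicolumn`` by passing one of the new "naive" alignments (output not shown; see the test below for the expected LaTeX).

    import pandas as pd

    # Styler requires jinja2 to be installed.
    cols = pd.MultiIndex.from_tuples([("A", "a"), ("A", "b")])
    df = pd.DataFrame([[0, 1]], columns=cols)
    print(df.style.to_latex(multicol_align="naive-l"))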

pandas/io/formats/style_render.py (+8)

@@ -1622,6 +1622,14 @@ def _parse_latex_header_span(
     if 'colspan="' in attrs:
         colspan = attrs[attrs.find('colspan="') + 9 :]  # len('colspan="') = 9
         colspan = int(colspan[: colspan.find('"')])
+        if "naive-l" == multicol_align:
+            out = f"{{{display_val}}}" if wrap else f"{display_val}"
+            blanks = " & {}" if wrap else " &"
+            return out + blanks * (colspan - 1)
+        elif "naive-r" == multicol_align:
+            out = f"{{{display_val}}}" if wrap else f"{display_val}"
+            blanks = "{} & " if wrap else "& "
+            return blanks * (colspan - 1) + out
         return f"\\multicolumn{{{colspan}}}{{{multicol_align}}}{{{display_val}}}"
     elif 'rowspan="' in attrs:
         if multirow_align == "naive":
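
Note: a standalone sketch of the naive-span padding added above (outside pandas, non-siunitx case) — a header spanning ``colspan`` columns becomes one labelled cell plus blank cells instead of a ``\multicolumn`` command.

    display_val, colspan = "A", 3

    naive_l = display_val + " &" * (colspan - 1)  # "A & &"  label left, blanks right
    naive_r = "& " * (colspan - 1) + display_val  # "& & A"  blanks left, label right
    print(naive_l, "|", naive_r)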

pandas/tests/io/formats/style/test_to_latex.py (+29)

@@ -302,6 +302,35 @@ def test_multiindex_row_and_col(df_ext):
     assert result == expected


+@pytest.mark.parametrize(
+    "multicol_align, siunitx, header",
+    [
+        ("naive-l", False, " & A & &"),
+        ("naive-r", False, " & & & A"),
+        ("naive-l", True, "{} & {A} & {} & {}"),
+        ("naive-r", True, "{} & {} & {} & {A}"),
+    ],
+)
+def test_multicol_naive(df, multicol_align, siunitx, header):
+    ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("A", "c")])
+    df.columns = ridx
+    level1 = " & a & b & c" if not siunitx else "{} & {a} & {b} & {c}"
+    col_format = "lrrl" if not siunitx else "lSSl"
+    expected = dedent(
+        f"""\
+        \\begin{{tabular}}{{{col_format}}}
+        {header} \\\\
+        {level1} \\\\
+        0 & 0 & -0.61 & ab \\\\
+        1 & 1 & -1.22 & cd \\\\
+        \\end{{tabular}}
+        """
+    )
+    styler = df.style.format(precision=2)
+    result = styler.to_latex(multicol_align=multicol_align, siunitx=siunitx)
+    assert expected == result
+
+
 def test_multi_options(df_ext):
     cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")])
     ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")])
