Skip to content

CLN: private funcs in concat.py #36726

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Oct 4, 2020
Merged
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 60 additions & 45 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
import copy
from typing import Dict, List
from typing import Any, Dict, List, Sequence, Tuple, cast

import numpy as np

Expand All @@ -25,6 +25,7 @@

import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray, ExtensionArray
from pandas.core.arrays.sparse.dtype import SparseDtype
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if we have a policy, but in general, if a type is imported just for type checking, we normally put the import in an `if TYPE_CHECKING:` block. If this is not creating import errors then it may be OK. See what others think.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@simonjayhawkins, I updated imports as you suggested, so that SparseDtype is used only for type checking.

from pandas.core.internals.blocks import make_block
from pandas.core.internals.managers import BlockManager

Expand Down Expand Up @@ -344,7 +345,7 @@ def _concatenate_join_units(join_units, concat_axis, copy):
return concat_values


def _get_empty_dtype_and_na(join_units):
def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]:
"""
Return dtype and N/A values to use when concatenating specified units.

Expand Down Expand Up @@ -374,45 +375,8 @@ def _get_empty_dtype_and_na(join_units):
else:
dtypes[i] = unit.dtype

upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
for dtype, unit in zip(dtypes, join_units):
if dtype is None:
continue
upcast_classes = _get_upcast_classes(join_units, dtypes)

if is_categorical_dtype(dtype):
upcast_cls = "category"
elif is_datetime64tz_dtype(dtype):
upcast_cls = "datetimetz"

elif is_extension_array_dtype(dtype):
upcast_cls = "extension"

elif issubclass(dtype.type, np.bool_):
upcast_cls = "bool"
elif issubclass(dtype.type, np.object_):
upcast_cls = "object"
elif is_datetime64_dtype(dtype):
upcast_cls = "datetime"
elif is_timedelta64_dtype(dtype):
upcast_cls = "timedelta"
elif is_sparse(dtype):
upcast_cls = dtype.subtype.name
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
upcast_cls = dtype.name
else:
upcast_cls = "float"

# Null blocks should not influence upcast class selection, unless there
# are only null blocks, when same upcasting rules must be applied to
# null upcast classes.
if unit.is_na:
null_upcast_classes[upcast_cls].append(dtype)
else:
upcast_classes[upcast_cls].append(dtype)

if not upcast_classes:
upcast_classes = null_upcast_classes
# TODO: de-duplicate with maybe_promote?
# create the result
if "extension" in upcast_classes:
Expand Down Expand Up @@ -441,23 +405,74 @@ def _get_empty_dtype_and_na(join_units):
return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
else: # pragma
try:
g = np.find_common_type(upcast_classes, [])
common_dtype = np.find_common_type(upcast_classes, [])
except TypeError:
# At least one is an ExtensionArray
return np.dtype(np.object_), np.nan
else:
if is_float_dtype(g):
return g, g.type(np.nan)
elif is_numeric_dtype(g):
if is_float_dtype(common_dtype):
return common_dtype, common_dtype.type(np.nan)
elif is_numeric_dtype(common_dtype):
if has_none_blocks:
return np.dtype(np.float64), np.nan
else:
return g, None
return common_dtype, None

msg = "invalid dtype determination in get_concat_dtype"
raise AssertionError(msg)


def _get_upcast_classes(
join_units: Sequence[JoinUnit],
dtypes: Sequence[DtypeObj],
) -> Dict[str, List[DtypeObj]]:
"""Create mapping between upcast class names and lists of dtypes."""
upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
for dtype, unit in zip(dtypes, join_units):
if dtype is None:
continue

upcast_cls = _select_upcast_cls_from_dtype(dtype)
# Null blocks should not influence upcast class selection, unless there
# are only null blocks, when same upcasting rules must be applied to
# null upcast classes.
if unit.is_na:
null_upcast_classes[upcast_cls].append(dtype)
else:
upcast_classes[upcast_cls].append(dtype)

if not upcast_classes:
upcast_classes = null_upcast_classes

return upcast_classes


def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
"""Select upcast class name based on dtype."""
if is_categorical_dtype(dtype):
return "category"
elif is_datetime64tz_dtype(dtype):
return "datetimetz"
elif is_extension_array_dtype(dtype):
return "extension"
elif issubclass(dtype.type, np.bool_):
return "bool"
elif issubclass(dtype.type, np.object_):
return "object"
elif is_datetime64_dtype(dtype):
return "datetime"
elif is_timedelta64_dtype(dtype):
return "timedelta"
elif is_sparse(dtype):
dtype = cast(SparseDtype, dtype)
return dtype.subtype.name
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
return dtype.name
else:
return "float"


def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
"""
Check if the join units consist of blocks of uniform type that can
Expand Down