Skip to content

Switch dataframe constructor to use dispatch #32844

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ Other enhancements
- :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
- Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
- You can now override how pandas constructs a :class:`DataFrame` from custom objects by registering an implementation on the
``pandas.core.internals.construction.create_dataframe`` ``singledispatch`` function (:issue:`32844`)
-

.. ---------------------------------------------------------------------------
Expand Down
108 changes: 8 additions & 100 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
import warnings

import numpy as np
import numpy.ma as ma

from pandas._config import get_option

Expand Down Expand Up @@ -67,7 +66,6 @@
maybe_convert_platform,
maybe_downcast_to_dtype,
maybe_infer_to_datetimelike,
maybe_upcast,
maybe_upcast_putmask,
validate_numeric_casting,
)
Expand All @@ -77,7 +75,6 @@
ensure_platform_int,
infer_dtype_from_object,
is_bool_dtype,
is_dataclass,
is_datetime64_any_dtype,
is_dict_like,
is_dtype_equal,
Expand All @@ -88,7 +85,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_object_dtype,
is_period_dtype,
is_scalar,
Expand All @@ -105,7 +101,7 @@

from pandas.core import algorithms, common as com, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.generic import NDFrame, _shared_docs
Expand All @@ -115,14 +111,9 @@
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
arrays_to_mgr,
dataclasses_to_dicts,
get_names_from_index,
init_dict,
init_ndarray,
masked_rec_array_to_mgr,
create_dataframe,
reorder_arrays,
sanitize_index,
to_arrays,
Expand Down Expand Up @@ -427,97 +418,9 @@ def __init__(
dtype: Optional[Dtype] = None,
copy: bool = False,
):
if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)

if isinstance(data, DataFrame):
data = data._data

if isinstance(data, BlockManager):
mgr = self._init_mgr(
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
)
elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)

# a masked array
else:
mask = ma.getmaskarray(data)
if mask.any():
data, fill_value = maybe_upcast(data, copy=True)
data.soften_mask() # set hardmask False if it was True
data[mask] = fill_value
else:
data = data.copy()
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
if columns is None:
columns = data_columns
mgr = init_dict(data, index, columns, dtype=dtype)
elif getattr(data, "name", None) is not None:
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

# For data is list-like, or Iterable (will consume into list)
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
if not isinstance(data, (abc.Sequence, ExtensionArray)):
data = list(data)
if len(data) > 0:
if is_dataclass(data[0]):
data = dataclasses_to_dicts(data)
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

# set the index
if index is None:
if isinstance(data[0], Series):
index = get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))

mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
else:
mgr = init_dict({}, index, columns, dtype=dtype)
else:
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as err:
exc = TypeError(
"DataFrame constructor called with "
f"incompatible data and dtype: {err}"
)
raise exc from err

if arr.ndim == 0 and index is not None and columns is not None:
values = cast_scalar_to_array(
(len(index), len(columns)), data, dtype=dtype
)
mgr = init_ndarray(
values, index, columns, dtype=values.dtype, copy=False
)
else:
raise ValueError("DataFrame constructor not properly called!")

mgr = create_dataframe(data, index, columns, dtype, copy, type(self))
NDFrame.__init__(self, mgr)

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -8548,6 +8451,11 @@ def isin(self, values) -> "DataFrame":
ops.add_special_arithmetic_methods(DataFrame)


# The dispatch class is passed to ``register`` explicitly: inferring it from the
# annotation of the first parameter is only supported on Python 3.7+.
@create_dataframe.register(DataFrame)
def _create_dataframe_dataframe(data: DataFrame, *args, **kwargs):
    """
    ``create_dataframe`` implementation for data that is itself a DataFrame.

    Dispatches ``create_dataframe`` again on ``data._data`` and forwards every
    other argument unchanged.
    """
    return create_dataframe(data._data, *args, **kwargs)


def _from_nested_dict(data):
# TODO: this should be seriously cythonized
new_data = collections.defaultdict(dict)
Expand Down
141 changes: 140 additions & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@
constructors before passing them to a BlockManager.
"""
from collections import abc
import functools
from typing import Any, List, Optional, Type, Union, cast

import numpy as np
import numpy.ma as ma
import numpy.ma.mrecords as mrecords

from pandas._libs import lib
from pandas._typing import Axes, Dtype

from pandas.core.dtypes.cast import (
cast_scalar_to_array,
construct_1d_arraylike_from_scalar,
maybe_cast_to_datetime,
maybe_convert_platform,
Expand All @@ -18,11 +23,13 @@
)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_dataclass,
is_datetime64tz_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_integer_dtype,
is_list_like,
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.generic import (
Expand All @@ -35,8 +42,9 @@
)

from pandas.core import algorithms, common as com
from pandas.core.arrays import Categorical
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.construction import extract_array, sanitize_array
from pandas.core.generic import NDFrame
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import (
Index,
Expand All @@ -45,9 +53,11 @@
union_indexes,
)
from pandas.core.internals import (
BlockManager,
create_block_manager_from_arrays,
create_block_manager_from_blocks,
)
from pandas.core.series import Series

# ---------------------------------------------------------------------
# BlockManager Interface
Expand Down Expand Up @@ -115,6 +125,135 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy: bool):
return mgr


@functools.singledispatch
def create_dataframe(
    data: Any,
    index: Optional[Axes],
    columns: Optional[Axes],
    dtype: Optional[Dtype],
    copy: bool,
    cls: Type[NDFrame],
) -> BlockManager:
    """
    Build the BlockManager for ``data``; used by the DataFrame constructor.

    This is the fallback of a ``functools.singledispatch`` function and runs for
    any type of ``data`` that has no registered implementation. To teach the
    constructor how to convert a custom object, register an implementation for
    its type on this function.

    The fallback only accepts data that NumPy converts to a zero-dimensional
    array, and only when both ``index`` and ``columns`` are given; the value is
    then broadcast to shape ``(len(index), len(columns))``.

    Raises
    ------
    TypeError
        If ``np.array(data, dtype=dtype, copy=copy)`` raises ``ValueError`` or
        ``TypeError``.
    ValueError
        If the converted data is not zero-dimensional, or if ``index`` or
        ``columns`` is None.
    """
    # Converting to an ndarray is how we find out whether data is scalar-like
    try:
        as_array = np.array(data, dtype=dtype, copy=copy)
    except (ValueError, TypeError) as err:
        raise TypeError(
            f"DataFrame constructor called with incompatible data and dtype: {err}"
        ) from err

    if as_array.ndim != 0 or index is None or columns is None:
        raise ValueError("DataFrame constructor not properly called!")

    shape = (len(index), len(columns))
    values = cast_scalar_to_array(shape, data, dtype=dtype)
    return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)


# The dispatch class is passed to ``register`` explicitly: inferring it from the
# annotation of the first parameter is only supported on Python 3.7+.
@create_dataframe.register(type(None))
def _create_dataframe_none(data: None, *args, **kwargs):
    """
    ``create_dataframe`` implementation for ``data=None``.

    Treats missing data as an empty dict: dispatches ``create_dataframe`` again
    with ``{}`` and forwards every other argument unchanged.
    """
    return create_dataframe({}, *args, **kwargs)


# The dispatch class is passed to ``register`` explicitly: inferring it from the
# annotation of the first parameter is only supported on Python 3.7+.
@create_dataframe.register(BlockManager)
def _create_dataframe_blockmanager(
    data: BlockManager, index, columns, dtype, copy, cls
):
    """
    ``create_dataframe`` implementation for data that is already a BlockManager.

    Delegates to ``cls._init_mgr``, passing ``index`` and ``columns`` as the
    ``axes`` mapping together with ``dtype`` and ``copy``.
    """
    return cls._init_mgr(
        data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
    )


# The dispatch class is passed to ``register`` explicitly: inferring it from the
# annotation of the first parameter is only supported on Python 3.7+.
@create_dataframe.register(dict)
def _create_dataframe_dict(data: dict, index, columns, dtype, copy, cls):
    """
    ``create_dataframe`` implementation for dict data.

    Delegates to ``init_dict``; ``copy`` and ``cls`` are accepted for signature
    compatibility with the other implementations but are not used here.
    """
    return init_dict(data, index, columns, dtype=dtype)


# The dispatch class is passed to ``register`` explicitly: inferring it from the
# annotation of the first parameter is only supported on Python 3.7+.
@create_dataframe.register(ma.MaskedArray)
def _create_dataframe_masked_array(
    data: ma.MaskedArray, index, columns, dtype, copy, cls
):
    """
    ``create_dataframe`` implementation for NumPy masked arrays.

    If any element is masked, the array is upcast with ``maybe_upcast`` and the
    masked positions are overwritten with the fill value it returns; otherwise
    the array is copied. The result is handed to ``init_ndarray``.
    """
    mask = ma.getmaskarray(data)
    if mask.any():
        data, fill_value = maybe_upcast(data, copy=True)
        data.soften_mask()  # set hardmask False if it was True
        data[mask] = fill_value
    else:
        data = data.copy()
    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)


# The dispatch class is passed to ``register`` explicitly: inferring it from the
# annotation of the first parameter is only supported on Python 3.7+.
@create_dataframe.register(mrecords.MaskedRecords)
def _create_dataframe_masked_record(
    data: mrecords.MaskedRecords, index, columns, dtype, copy, cls
):
    """
    ``create_dataframe`` implementation for masked record arrays.

    Delegates to ``masked_rec_array_to_mgr``; ``cls`` is accepted for signature
    compatibility with the other implementations but is not used here.
    """
    return masked_rec_array_to_mgr(data, index, columns, dtype, copy)


@create_dataframe.register(np.ndarray)
@create_dataframe.register(Series)
@create_dataframe.register(Index)
def _create_dataframe_array_series_index(
    data: Union[np.ndarray, Series, Index], index, columns, dtype, copy, cls
):
    """
    ``create_dataframe`` implementation shared by ndarray, Series and Index data.

    Three cases are handled, in this order:

    * the dtype has named fields: every field becomes one column, and the field
      names are used as ``columns`` when none were passed
    * the data has a ``name`` attribute that is not None: the data becomes a
      single column labelled with that name
    * anything else is passed to ``init_ndarray``
    """
    field_names = data.dtype.names
    if field_names:
        field_names = list(field_names)
        per_field = {field: data[field] for field in field_names}
        if columns is None:
            columns = field_names
        return init_dict(per_field, index, columns, dtype=dtype)

    if getattr(data, "name", None) is not None:
        return init_dict({data.name: data}, index, columns, dtype=dtype)

    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)


class _IterableExceptStringOrBytesMeta(type):
    """
    Metaclass whose classes act as a virtual base of every iterable type except
    ``str``, ``bytes`` and their subclasses.
    """

    def __subclasscheck__(cls, sub: Type) -> bool:
        return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable)

    def __instancecheck__(cls, instance: object) -> bool:
        # Keep isinstance() in agreement with issubclass(). Without this hook
        # only real instances would pass, so isinstance([], X) would be False
        # even though issubclass(list, X) is True. Real instances still pass
        # through the default check.
        return super().__instancecheck__(instance) or cls.__subclasscheck__(
            type(instance)
        )


class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
    """
    Class that is subclass of iterable but not of str or bytes to use for singledispatch
    registration
    """

    pass


@create_dataframe.register(_IterableExceptStringOrBytes)
def _create_dataframe_iterable(data: abc.Iterable, index, columns, dtype, copy, cls):
    """
    ``create_dataframe`` implementation for iterables other than str and bytes.

    Data that is neither a ``Sequence`` nor an ``ExtensionArray`` is first
    consumed into a list. After that:

    * empty data gives the same result as an empty dict
    * data whose first element is list-like and one-dimensional is converted
      with ``to_arrays`` and ``arrays_to_mgr``
    * any other data is passed to ``init_ndarray``
    """
    if not isinstance(data, (abc.Sequence, ExtensionArray)):
        data = list(data)

    if len(data) == 0:
        return init_dict({}, index, columns, dtype=dtype)

    if is_dataclass(data[0]):
        data = cast(List[dict], dataclasses_to_dicts(data))

    # Looked up only after the dataclass conversion, which replaces the elements
    first = data[0]

    is_one_dimensional = is_list_like(first) and getattr(first, "ndim", 1) == 1
    if not is_one_dimensional:
        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)

    # Named tuples supply the column labels when none were passed
    if is_named_tuple(first) and columns is None:
        columns = first._fields
    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    # set the index
    if index is None:
        if isinstance(first, Series):
            index = get_names_from_index(data)
        else:
            n_rows = len(first) if isinstance(first, Categorical) else len(data)
            index = ibase.default_index(n_rows)

    return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)


# ---------------------------------------------------------------------
# DataFrame Constructor Interface

Expand Down
Loading