Skip to content

GH-35081: [Python] construct pandas.DataFrame with public API in to_pandas #40897

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
7 changes: 6 additions & 1 deletion python/pyarrow/pandas-shim.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype, _lock
bint has_sparse
bint _pd024
bint _is_v1, _is_ge_v21
bint _is_v1, _is_ge_v21, _is_ge_v3

def __init__(self):
self._lock = Lock()
Expand Down Expand Up @@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object):

self._is_v1 = self._loose_version < Version('2.0.0')
self._is_ge_v21 = self._loose_version >= Version('2.1.0')
self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
Expand Down Expand Up @@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._is_ge_v21

def is_ge_v3(self):
    """Return whether the imported pandas version is >= 3.0.0.dev0.

    Mirrors the sibling accessors ``is_v1``/``is_ge_v21``: it forces the
    lazy pandas import first, then returns the flag precomputed in
    ``__init__`` from the parsed version.
    """
    self._check_import()
    return self._is_ge_v3

@property
def categorical_type(self):
self._check_import()
Expand Down
75 changes: 42 additions & 33 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_):
# Converting pyarrow.Table efficiently to pandas.DataFrame


def _reconstruct_block(item, columns=None, extension_columns=None):
def _reconstruct_block(item, columns=None, extension_columns=None, return_block=True):
"""
Construct a pandas Block from the `item` dictionary coming from pyarrow's
serialization or returned by arrow::python::ConvertTableToPandas.
Expand Down Expand Up @@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
block_arr = item.get('block', None)
placement = item['placement']
if 'dictionary' in item:
cat = _pandas_api.categorical_type.from_codes(
arr = _pandas_api.categorical_type.from_codes(
block_arr, categories=item['dictionary'],
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
unit, _ = np.datetime_data(block_arr.dtype)
dtype = make_datetimetz(unit, item['timezone'])
if _pandas_api.is_ge_v21():
pd_arr = _pandas_api.pd.array(
arr = _pandas_api.pd.array(
block_arr.view("int64"), dtype=dtype, copy=False
)
block = _int.make_block(pd_arr, placement=placement)
else:
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
arr = block_arr
if return_block:
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
return block
elif 'py_array' in item:
# create ExtensionBlock
arr = item['py_array']
Expand All @@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
if not hasattr(pandas_dtype, '__from_arrow__'):
raise ValueError("This column does not support to be converted "
"to a pandas ExtensionArray")
pd_ext_arr = pandas_dtype.__from_arrow__(arr)
block = _int.make_block(pd_ext_arr, placement=placement)
arr = pandas_dtype.__from_arrow__(arr)
else:
block = _int.make_block(block_arr, placement=placement)
arr = block_arr

return block
if return_block:
return _int.make_block(arr, placement=placement)
else:
return arr, placement


def make_datetimetz(unit, tz):
Expand All @@ -752,9 +755,6 @@ def make_datetimetz(unit, tz):
def table_to_dataframe(
options, table, categories=None, ignore_metadata=False, types_mapper=None
):
from pandas.core.internals import BlockManager
from pandas import DataFrame

all_columns = []
column_indexes = []
pandas_metadata = table.schema.pandas_metadata
Expand All @@ -774,15 +774,35 @@ def table_to_dataframe(

_check_data_column_metadata_consistency(all_columns)
columns = _deserialize_column_index(table, all_columns, column_indexes)
blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)

axes = [columns, index]
mgr = BlockManager(blocks, axes)
if _pandas_api.is_ge_v21():
df = DataFrame._from_mgr(mgr, mgr.axes)
column_names = table.column_names
result = pa.lib.table_to_blocks(options, table, categories,
list(ext_columns_dtypes.keys()))
if _pandas_api.is_ge_v3():
from pandas.api.internals import create_dataframe_from_blocks

blocks = [
_reconstruct_block(
item, column_names, ext_columns_dtypes, return_block=False)
for item in result
]
df = create_dataframe_from_blocks(blocks, index=index, columns=columns)
return df
else:
df = DataFrame(mgr)
return df
from pandas.core.internals import BlockManager
from pandas import DataFrame

blocks = [
_reconstruct_block(item, column_names, ext_columns_dtypes)
for item in result
]
axes = [columns, index]
mgr = BlockManager(blocks, axes)
if _pandas_api.is_ge_v21():
df = DataFrame._from_mgr(mgr, mgr.axes)
else:
df = DataFrame(mgr)
return df


# Set of the string repr of all numpy dtypes that can be stored in a pandas
Expand Down Expand Up @@ -1099,17 +1119,6 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
return pd.Index(new_levels[0], dtype=new_levels[0].dtype, name=columns.name)


def _table_to_blocks(options, block_table, categories, extension_columns):
    """Convert an Arrow table into a list of pandas internal Blocks.

    Part of table_to_blockmanager: delegates the heavy conversion to the
    C++ layer (``pa.lib.table_to_blocks``) and then rebuilds each returned
    item as a pandas Block via ``_reconstruct_block``.

    Parameters
    ----------
    options : conversion options forwarded to ``pa.lib.table_to_blocks``
    block_table : pyarrow.Table to convert
    categories : columns to convert to categorical
        (NOTE(review): presumably column names — confirm against caller)
    extension_columns : dict mapping column name -> pandas extension dtype;
        only the keys are passed to the C++ conversion, the full mapping is
        used when reconstructing blocks.
    """
    # Convert an arrow table to Block from the internal pandas API
    columns = block_table.column_names
    result = pa.lib.table_to_blocks(options, block_table, categories,
                                    list(extension_columns.keys()))
    return [_reconstruct_block(item, columns, extension_columns)
            for item in result]


def _add_any_metadata(table, pandas_metadata):
modified_columns = {}
modified_fields = {}
Expand Down