BUG: GH29310 HDF file compression not working #29404

Closed · 14 commits
6 changes: 3 additions & 3 deletions pandas/_testing.py
@@ -842,7 +842,7 @@ def assert_categorical_equal(
if check_category_order:
assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories")
assert_numpy_array_equal(
left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes",
left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes"
)
else:
assert_index_equal(
@@ -982,7 +982,7 @@ def _raise(left, right, err_msg):
if err_msg is None:
if left.shape != right.shape:
raise_assert_detail(
obj, f"{obj} shapes are different", left.shape, right.shape,
obj, f"{obj} shapes are different", left.shape, right.shape
)

diff = 0
@@ -1326,7 +1326,7 @@ def assert_frame_equal(
# shape comparison
if left.shape != right.shape:
raise_assert_detail(
obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}",
obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}"
)

if check_like:
4 changes: 1 addition & 3 deletions pandas/core/arrays/masked.py
@@ -50,9 +50,7 @@ def __iter__(self):
def __len__(self) -> int:
return len(self._data)

def to_numpy(
self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
):
def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default):
"""
Convert to a NumPy Array.

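As a usage note, a minimal sketch of the `to_numpy` signature shown above, assuming a nullable integer array:

```python
import numpy as np
import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")
# An explicit dtype and na_value avoid falling back to object dtype.
print(arr.to_numpy(dtype="float64", na_value=np.nan))  # [ 1.  2. nan]
```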
2 changes: 1 addition & 1 deletion pandas/core/arrays/period.py
@@ -608,7 +608,7 @@ def _sub_period(self, other):
return new_data

def _addsub_int_array(
self, other: np.ndarray, op: Callable[[Any, Any], Any],
self, other: np.ndarray, op: Callable[[Any, Any], Any]
) -> "PeriodArray":
"""
Add or subtract array of integers; equivalent to applying
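For context, `_addsub_int_array` backs integer-array arithmetic on period data; a minimal sketch of the public behavior it implements (monthly periods assumed for illustration):

```python
import numpy as np
import pandas as pd

pi = pd.period_range("2020-01", periods=3, freq="M")
# Each integer is applied as that many multiples of the index frequency.
print(pi + np.array([0, 1, 2]))  # ['2020-01', '2020-03', '2020-05']
```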
8 changes: 1 addition & 7 deletions pandas/core/groupby/__init__.py
@@ -2,10 +2,4 @@
from pandas.core.groupby.groupby import GroupBy
from pandas.core.groupby.grouper import Grouper

__all__ = [
"DataFrameGroupBy",
"NamedAgg",
"SeriesGroupBy",
"GroupBy",
"Grouper",
]
__all__ = ["DataFrameGroupBy", "NamedAgg", "SeriesGroupBy", "GroupBy", "Grouper"]
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
@@ -274,7 +274,7 @@ def _outer_indexer(self, left, right):
# Constructors

def __new__(
cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs,
cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs
) -> "Index":

from pandas.core.indexes.range import RangeIndex
2 changes: 1 addition & 1 deletion pandas/core/indexes/range.py
@@ -84,7 +84,7 @@ class RangeIndex(Int64Index):
# Constructors

def __new__(
cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None,
cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None
):

cls._validate_dtype(dtype)
4 changes: 1 addition & 3 deletions pandas/core/indexes/timedeltas.py
@@ -69,9 +69,7 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin):
typ="method",
overwrite=True,
)
class TimedeltaIndex(
DatetimeTimedeltaMixin, dtl.TimelikeOps, TimedeltaDelegateMixin,
):
class TimedeltaIndex(DatetimeTimedeltaMixin, dtl.TimelikeOps, TimedeltaDelegateMixin):
"""
Immutable ndarray of timedelta64 data, represented internally as int64, and
which can be boxed to timedelta objects.
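A quick illustration of the storage and boxing the docstring above describes:

```python
import pandas as pd

tdi = pd.TimedeltaIndex(["1 days", "2 days 06:00:00"])
print(tdi.asi8)  # underlying int64 nanosecond values
print(tdi[0])    # scalar access boxes to a Timedelta: 1 days 00:00:00
```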
2 changes: 1 addition & 1 deletion pandas/core/nanops.py
@@ -1282,7 +1282,7 @@ def _zero_out_fperr(arg):

@disallow("M8", "m8")
def nancorr(
a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None,
a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None
):
"""
a, b: ndarrays
9 changes: 1 addition & 8 deletions pandas/core/ops/__init__.py
@@ -82,14 +82,7 @@
}


COMPARISON_BINOPS: Set[str] = {
"eq",
"ne",
"lt",
"gt",
"le",
"ge",
}
COMPARISON_BINOPS: Set[str] = {"eq", "ne", "lt", "gt", "le", "ge"}

# -----------------------------------------------------------------------------
# Ops Wrapping Utilities
2 changes: 1 addition & 1 deletion pandas/core/ops/dispatch.py
@@ -94,7 +94,7 @@ def should_series_dispatch(left, right, op):


def dispatch_to_extension_op(
op, left: Union[ABCExtensionArray, np.ndarray], right: Any,
op, left: Union[ABCExtensionArray, np.ndarray], right: Any
):
"""
Assume that left or right is a Series backed by an ExtensionArray,
4 changes: 2 additions & 2 deletions pandas/core/reshape/pivot.py
@@ -226,7 +226,7 @@ def _add_margins(

elif values:
marginal_result_set = _generate_marginal_results(
table, data, values, rows, cols, aggfunc, observed, margins_name,
table, data, values, rows, cols, aggfunc, observed, margins_name
)
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
@@ -295,7 +295,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"):


def _generate_marginal_results(
table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All",
table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All"
):
if len(cols) > 0:
# need to "interleave" the margins
4 changes: 2 additions & 2 deletions pandas/core/window/indexers.py
@@ -35,7 +35,7 @@ class BaseIndexer:
"""Base class for window bounds calculations."""

def __init__(
self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs,
self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs
):
"""
Parameters
@@ -100,7 +100,7 @@ def get_window_bounds(
) -> Tuple[np.ndarray, np.ndarray]:

return calculate_variable_window_bounds(
num_values, self.window_size, min_periods, center, closed, self.index_array,
num_values, self.window_size, min_periods, center, closed, self.index_array
)


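`BaseIndexer` is the public extension point these signatures belong to. Below is a sketch of a custom indexer built on it; the class name and the forward-looking window shape are hypothetical, not part of this PR:

```python
import numpy as np
import pandas as pd
from pandas.api.indexers import BaseIndexer

class ForwardIndexer(BaseIndexer):
    """Hypothetical indexer whose windows look window_size rows ahead."""

    def get_window_bounds(self, num_values=0, min_periods=None, center=None, closed=None):
        start = np.arange(num_values, dtype=np.int64)
        end = np.minimum(start + self.window_size, num_values)
        return start, end

s = pd.Series(range(5))
print(s.rolling(ForwardIndexer(window_size=2), min_periods=1).sum())
```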
2 changes: 1 addition & 1 deletion pandas/core/window/numba_.py
@@ -63,7 +63,7 @@ def impl(window, *_args):

@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def roll_apply(
values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int,
values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int
) -> np.ndarray:
result = np.empty(len(begin))
for i in loop_range(len(result)):
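`roll_apply` above is the jitted kernel behind rolling `apply` with `engine="numba"`; a usage sketch of that entry point (assumes numba is installed):

```python
import numpy as np
import pandas as pd

def window_mean(values):
    # Compiled by numba; np.mean on an ndarray works in nopython mode.
    return np.mean(values)

s = pd.Series(np.arange(10, dtype="float64"))
# raw=True is required with engine="numba" so window_mean receives ndarrays.
print(s.rolling(3).apply(window_mean, engine="numba", raw=True))
```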
9 changes: 2 additions & 7 deletions pandas/io/formats/css.py
@@ -20,9 +20,7 @@ def expand(self, prop, value: str):
try:
mapping = self.SIDE_SHORTHANDS[len(tokens)]
except KeyError:
warnings.warn(
f'Could not expand "{prop}: {value}"', CSSWarning,
)
warnings.warn(f'Could not expand "{prop}: {value}"', CSSWarning)
return
for key, idx in zip(self.SIDES, mapping):
yield prop_fmt.format(key), tokens[idx]
@@ -117,10 +115,7 @@ def __call__(self, declarations_str, inherited=None):
props[prop] = self.size_to_pt(
props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS
)
for prop in [
f"margin-{side}",
f"padding-{side}",
]:
for prop in [f"margin-{side}", f"padding-{side}"]:
if prop in props:
# TODO: support %
props[prop] = self.size_to_pt(
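For context, `CSSResolver.__call__` (its signature is visible in the hunk header above) expands shorthands such as `margin` into per-side declarations. A sketch using this internal class; illustrative only, and the exact normalized output may differ:

```python
from pandas.io.formats.css import CSSResolver

resolve = CSSResolver()
# The 2-token shorthand maps to (top, right, bottom, left) = (1pt, 2pt, 1pt, 2pt).
expanded = resolve("margin: 1pt 2pt")
print(expanded)  # e.g. {'margin-top': '1pt', 'margin-right': '2pt', ...}
```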
3 changes: 1 addition & 2 deletions pandas/io/formats/excel.py
@@ -140,8 +140,7 @@ def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]:
return {
side: {
"style": self._border_style(
props.get(f"border-{side}-style"),
props.get(f"border-{side}-width"),
props.get(f"border-{side}-style"), props.get(f"border-{side}-width")
),
"color": self.color_to_excel(props.get(f"border-{side}-color")),
}
17 changes: 3 additions & 14 deletions pandas/io/formats/style.py
@@ -303,10 +303,7 @@ def format_attr(pair):

# ... except maybe the last for columns.names
name = self.data.columns.names[r]
cs = [
BLANK_CLASS if name is None else INDEX_NAME_CLASS,
f"level{r}",
]
cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, f"level{r}"]
name = BLANK_VALUE if name is None else name
row_es.append(
{
@@ -320,11 +317,7 @@ def format_attr(pair):

if clabels:
for c, value in enumerate(clabels[r]):
cs = [
COL_HEADING_CLASS,
f"level{r}",
f"col{c}",
]
cs = [COL_HEADING_CLASS, f"level{r}", f"col{c}"]
cs.extend(
cell_context.get("col_headings", {}).get(r, {}).get(c, [])
)
@@ -368,11 +361,7 @@ def format_attr(pair):
for r, idx in enumerate(self.data.index):
row_es = []
for c, value in enumerate(rlabels[r]):
rid = [
ROW_HEADING_CLASS,
f"level{c}",
f"row{r}",
]
rid = [ROW_HEADING_CLASS, f"level{c}", f"row{r}"]
es = {
"type": "th",
"is_visible": (_is_visible(r, c, idx_lengths) and not hidden_index),
2 changes: 1 addition & 1 deletion pandas/io/orc.py
@@ -12,7 +12,7 @@


def read_orc(
path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs,
path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs
) -> "DataFrame":
"""
Load an ORC object from the file path, returning a DataFrame.
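A minimal usage sketch of `read_orc` (assumes pyarrow is installed; the `data.orc` file and column names are hypothetical):

```python
import pandas as pd

# Only the requested columns are read; remaining kwargs go to pyarrow.
df = pd.read_orc("data.orc", columns=["a", "b"])
print(df.head())
```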
48 changes: 35 additions & 13 deletions pandas/io/pytables.py
@@ -270,6 +270,7 @@ def to_hdf(
min_itemsize=min_itemsize,
nan_rep=nan_rep,
data_columns=data_columns,
dropna=dropna,
errors=errors,
encoding=encoding,
)
@@ -995,6 +996,7 @@ def put(
min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
nan_rep=None,
data_columns: Optional[List[str]] = None,
dropna: Optional[bool] = False,
encoding=None,
errors: str = "strict",
):
@@ -1015,14 +1017,25 @@
append : bool, default False
This will force Table format, append the input data to the
existing.
complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
'blosc:zlib', 'blosc:zstd'}.
Specifying a compression library that is not available raises
a ValueError.
complevel : int, 0-9, default None
Specifies a compression level for data.
A value of 0 or None disables compression.
dropna : bool, default False
Do not write an ALL nan row to the store, settable by
the option 'io.hdf.dropna_table'.
data_columns : list, default None
List of columns to create as data columns, or True to
use all columns. See `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
encoding : str, default None
Provide an encoding for strings.
dropna : bool, default False
Do not write an ALL nan row to the store, settable by
the option 'io.hdf.dropna_table'.
"""
if format is None:
format = get_option("io.hdf.default_format") or "fixed"
@@ -1037,6 +1050,7 @@ def put(
complevel=complevel,
min_itemsize=min_itemsize,
nan_rep=nan_rep,
dropna=dropna,
data_columns=data_columns,
encoding=encoding,
errors=errors,
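To make the documented parameters concrete, a usage sketch of `put` with compression (assumes PyTables is installed; file names are illustrative):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(1000, 4), columns=list("abcd"))

with pd.HDFStore("compressed.h5", mode="w") as store:
    # complevel 1-9 enables compression; 0 or None leaves the data uncompressed.
    store.put("df", df, format="table", complib="blosc", complevel=9)

# DataFrame.to_hdf accepts the same compression parameters.
df.to_hdf("compressed_too.h5", key="df", format="table", complib="zlib", complevel=5)
```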
@@ -1147,6 +1161,17 @@ def append(
dropna : bool, default False
Do not write an ALL nan row to the store, settable
by the option 'io.hdf.dropna_table'.
complevel : int, 0-9, default None
Specifies a compression level for data.
A value of 0 or None disables compression.
complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
'blosc:zlib', 'blosc:zstd'}.
Specifying a compression library that is not available raises
a ValueError.

Notes
-----
@@ -2836,7 +2861,7 @@ def read_index_node(
# If the index was an empty array, write_array_empty() will
# have written a sentinel. Here we replace it with the original.
if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,)
data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
kind = _ensure_decoded(node._v_attrs.kind)
name = None

@@ -3578,10 +3603,7 @@ def _read_axes(
for a in self.axes:
a.set_info(self.info)
res = a.convert(
values,
nan_rep=self.nan_rep,
encoding=self.encoding,
errors=self.errors,
values, nan_rep=self.nan_rep, encoding=self.encoding, errors=self.errors
)
results.append(res)

@@ -4007,7 +4029,7 @@ def create_description(
return d

def read_coordinates(
self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
self, where=None, start: Optional[int] = None, stop: Optional[int] = None
):
"""select coordinates (row numbers) from a table; return the
coordinates object
@@ -4274,7 +4296,7 @@ def write_data_chunk(
self.table.flush()

def delete(
self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
self, where=None, start: Optional[int] = None, stop: Optional[int] = None
):

# delete all rows (and return the nrows)
@@ -4452,7 +4474,7 @@ def is_transposed(self) -> bool:
def get_object(cls, obj, transposed: bool):
return obj

def write(self, obj, data_columns=None, **kwargs):
def write(self, obj, data_columns=None, dropna=None, **kwargs):
""" we are going to write this as a frame table """
if not isinstance(obj, DataFrame):
name = obj.name or "values"
@@ -4705,7 +4727,7 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index
if inferred_type == "date":
converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
return IndexCol(
name, converted, "date", _tables().Time32Col(), index_name=index_name,
name, converted, "date", _tables().Time32Col(), index_name=index_name
)
elif inferred_type == "string":

Expand All @@ -4721,13 +4743,13 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index

elif inferred_type in ["integer", "floating"]:
return IndexCol(
name, values=converted, kind=kind, typ=atom, index_name=index_name,
name, values=converted, kind=kind, typ=atom, index_name=index_name
)
else:
assert isinstance(converted, np.ndarray) and converted.dtype == object
assert kind == "object", kind
atom = _tables().ObjectAtom()
return IndexCol(name, converted, kind, atom, index_name=index_name,)
return IndexCol(name, converted, kind, atom, index_name=index_name)


def _unconvert_index(
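Finally, a hedged end-to-end sketch of the behavior this PR wires up: `dropna` is now forwarded from `to_hdf` through `put` and `_write_to_group`, so an all-NaN row can be dropped on write without going through `append` (file name illustrative; assumes PyTables):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, np.nan, 6.0]})

# With this PR, dropna reaches put()/_write_to_group even when append=False,
# so the all-NaN middle row is skipped at write time.
df.to_hdf("dropna_example.h5", key="df", format="table",
          complib="blosc", complevel=9, dropna=True)

print(pd.read_hdf("dropna_example.h5", "df"))  # two rows; the NaN row is gone
```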