-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Fixing GH1434 xr.concat loses coordinate dtype information with recarrays in 0.9 #1438
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
f9c51b7
6635f9c
ee3a23e
16a7c37
96af68c
b8beca4
c0b97b1
693d019
a5d6ade
555cc02
72f5f24
a245586
7cec75c
66f20e3
6040c07
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -103,6 +103,10 @@ def align(*objects, **kwargs): | |
for dim in obj.dims: | ||
if dim not in exclude: | ||
try: | ||
# GH1434 | ||
# dtype is lost after obj.indexes[dim] | ||
# problem originates in Indexes.__getitem__ | ||
# in coordinates.py | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure why you're adding these comments here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Removed those |
||
index = obj.indexes[dim] | ||
except KeyError: | ||
unlabeled_dim_sizes[dim].add(obj.sizes[dim]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -207,8 +207,29 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions): | |
|
||
dim, coord = _calc_concat_dim_coord(dim) | ||
datasets = [as_dataset(ds) for ds in datasets] | ||
|
||
# GH1434 | ||
# constructing a dictionary that will be used to preserve dtype | ||
# of the original dataset dimensions | ||
dtype_dict = {} | ||
for ds in datasets: | ||
for dim_tuple in ds.dims.items(): | ||
dim_name = dim_tuple[0] | ||
if dim_name != dim: | ||
dtype_dict[dim_name] = ds[dim_name].dtype | ||
|
||
# align loses original dtype of the datasets' dim variables | ||
datasets = align(*datasets, join='outer', copy=False, exclude=[dim]) | ||
|
||
# GH1434 | ||
# restoring original dtype of the datasets' dimensions | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This fix should either be done in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the pointer. It was indeed the place where loss of dtype happened |
||
for ds in datasets: | ||
for dim_name, dim_dtype in dtype_dict.items(): | ||
try: | ||
ds[dim_name] = ds[dim_name].astype(dtype_dict[dim_name]) | ||
except KeyError: | ||
pass | ||
|
||
concat_over = _calc_concat_over(datasets, dim, data_vars, coords) | ||
|
||
def insert_result_variable(k, v): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1212,35 +1212,94 @@ def __setitem__(self, key, value): | |
raise TypeError('%s values cannot be modified' % type(self).__name__) | ||
|
||
@classmethod | ||
def concat(cls, variables, dim='concat_dim', positions=None, | ||
shortcut=False): | ||
"""Specialized version of Variable.concat for IndexVariable objects. | ||
|
||
This exists because we want to avoid converting Index objects to NumPy | ||
arrays, if possible. | ||
def concat_numpy(cls, variables, positions=None): | ||
""" | ||
if not isinstance(dim, basestring): | ||
dim, = dim.dims | ||
Concatenates variables. Works for variables whose dtype is | ||
different from numpy.object. If variables' dtype is numpy.object | ||
it throws TypeError and "concat" function will use | ||
concat_pandas function | ||
:param variables: list of variables to concatenate | ||
:return: Concatenated variables | ||
""" | ||
variable_type_set = set(map(lambda v: type(v.data), variables)) | ||
|
||
variables = list(variables) | ||
first_var = variables[0] | ||
if len(variable_type_set) > 1: | ||
raise TypeError('Trying to concatenate variables of ' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These extra checks would break some things that currently work with |
||
'different types') | ||
|
||
if any(not isinstance(v, cls) for v in variables): | ||
raise TypeError('IndexVariable.concat requires that all input ' | ||
'variables be IndexVariable objects') | ||
variable_type = list(variable_type_set)[0] | ||
if not variable_type == np.ndarray: | ||
raise TypeError('Can only concatenate variables whose ' | ||
'_data member is ndarray') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is already guaranteed by xarray's data model, so this should either be an There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK |
||
|
||
if variables[0].dtype == np.object: | ||
raise TypeError('We use concat_numpy for objects whose ' | ||
'dtypes are different than numpy.object ') | ||
|
||
indexes = [v._data for v in variables] | ||
|
||
if not indexes: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't need this special branch for numpy. Literally There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done |
||
data = [] | ||
else: | ||
data = np.concatenate((indexes)) | ||
|
||
if positions is not None: | ||
indices = nputils.inverse_permutation( | ||
np.concatenate(positions)) | ||
data = data.take(indices) | ||
|
||
return data | ||
|
||
@classmethod | ||
def concat_pandas(cls, variables, positions=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. if you keep these methods already, they should be private (preface with There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done |
||
""" | ||
Concatenates variables. This is generic function that | ||
handles all cases for which concat_numpy does not work | ||
:param variables: list of variables to concatenate | ||
:return: Concatenated variables | ||
""" | ||
indexes = [v._data.array for v in variables] | ||
|
||
if not indexes: | ||
data = [] | ||
else: | ||
|
||
data = indexes[0].append(indexes[1:]) | ||
|
||
if positions is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This block is the same for both |
||
indices = nputils.inverse_permutation( | ||
np.concatenate(positions)) | ||
data = data.take(indices) | ||
|
||
return data | ||
|
||
@classmethod | ||
def concat(cls, variables, dim='concat_dim', positions=None, | ||
shortcut=False): | ||
"""Specialized version of Variable.concat for | ||
IndexVariable objects. | ||
This exists because we want to avoid converting Index objects to NumPy | ||
arrays, if possible. | ||
""" | ||
if not isinstance(dim, basestring): | ||
dim, = dim.dims | ||
|
||
variables = list(variables) | ||
|
||
first_var = variables[0] | ||
|
||
if any(not isinstance(v, cls) for v in variables): | ||
raise TypeError('IndexVariable.concat requires that all input ' | ||
'variables be IndexVariable objects') | ||
|
||
# GH1434 | ||
# Fixes bug: "xr.concat loses coordinate dtype | ||
# information with recarrays in 0.9" | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Rather than using exceptions for control flow, can we divide this into two cases based upon whether any (all?) of the variables have dtype=object? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done |
||
data = cls.concat_numpy(variables, positions) | ||
except TypeError: | ||
data = cls.concat_pandas(variables, positions) | ||
|
||
attrs = OrderedDict(first_var.attrs) | ||
if not shortcut: | ||
for var in variables: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -101,7 +101,7 @@ | |
try: | ||
_SKIP_FLAKY = not pytest.config.getoption("--run-flaky") | ||
_SKIP_NETWORK_TESTS = not pytest.config.getoption("--run-network-tests") | ||
except ValueError: | ||
except (ValueError, AttributeError) as e: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. did Even if so, we don't need |
||
# Can't get config from pytest, e.g., because xarray is installed instead | ||
# of being run from a development version (and hence conftests.py is not | ||
# available). Don't run flaky tests. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like a different fix?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed