Parallel open_mfdataset #1983
Changes from 10 commits
@@ -453,7 +453,8 @@ def close(self):

def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts', preprocess=None, engine=None,
                   lock=None, data_vars='all', coords='different', **kwargs):
                   lock=None, data_vars='all', coords='different',
                   autoclose=False, parallel=None, **kwargs):
    """Open multiple files as a single dataset.

    Requires dask to be installed. See documentation for details on dask [1].
@@ -534,7 +535,10 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
          those corresponding to other dimensions.
        * list of str: The listed coordinate variables will be concatenated,
          in addition to the 'minimal' coordinates.
    parallel : bool, optional
        If True, the open and preprocess steps of this function will be
        performed in parallel using ``dask.delayed``. Default is False unless
        dask's distributed scheduler is being used, in which case the default
        is True.

Have you tested this with both a local system and an HPC cluster?

I'm curious about the logic of defaulting to parallel when using distributed.

Maybe a better choice would be to default to true when using distributed or multi-processing? That would probably be a simpler choice. Actually, this is probably fine even with using multi-threading -- it just won't give you any noticeable speedup. So I guess we could default to

    **kwargs : optional
        Additional arguments passed on to :py:func:`xarray.open_dataset`.
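As a usage sketch for the new ``parallel`` keyword discussed above (editorial illustration, not part of the diff; the file pattern and the distributed Client setup are made up):

import xarray as xr
from dask.distributed import Client

# explicit opt-in: each open/preprocess call becomes a dask.delayed task
ds = xr.open_mfdataset('data/file_*.nc', parallel=True)

# with the distributed scheduler active, the default (parallel=None) is
# intended to behave like parallel=True
client = Client()  # illustrative local cluster
ds = xr.open_mfdataset('data/file_*.nc')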
@@ -562,12 +566,26 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,

    if lock is None:
        lock = _default_lock(paths[0], engine)
    datasets = [open_dataset(p, engine=engine, chunks=chunks or {}, lock=lock,
                             **kwargs) for p in paths]
    file_objs = [ds._file_obj for ds in datasets]

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]
    open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock,
                       autoclose=autoclose, **kwargs)

    if parallel or (parallel is None and get_scheduler() == 'distributed'):
        import dask
        datasets = [delayed(open_dataset)(p, **open_kwargs) for p in paths]
        # important: get the file_objs before calling preprocess
        file_objs = [ds._file_obj for ds in datasets]
        if preprocess is not None:
            datasets = [delayed(preprocess)(ds) for ds in datasets]

        # calling compute here will return the datasets/file_objs lists;
        # the underlying datasets will still be stored as dask arrays
        datasets, file_objs = dask.compute(datasets, file_objs)
    else:
        datasets = [open_dataset(p, **open_kwargs) for p in paths]
        file_objs = [ds._file_obj for ds in datasets]
        if preprocess is not None:
            datasets = [preprocess(ds) for ds in datasets]

    # close datasets in case of a ValueError
    try:
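For readers unfamiliar with the pattern used in the parallel branch above, here is a minimal standalone sketch of the same idea, written as editorial illustration rather than PR code; the open_many helper and the file names in the usage comment are invented:

import dask
from dask import delayed

import xarray as xr


def open_many(paths, preprocess=None):
    # one delayed task per file; nothing is actually read until compute()
    opened = [delayed(xr.open_dataset)(p, chunks={}) for p in paths]
    if preprocess is not None:
        opened = [delayed(preprocess)(ds) for ds in opened]
    # a single compute() runs all open/preprocess tasks, potentially in
    # parallel, and returns concrete Dataset objects (still dask-backed)
    datasets, = dask.compute(opened)
    return list(datasets)


# hypothetical usage:
# dsets = open_many(['a.nc', 'b.nc'], preprocess=lambda ds: ds[['foo']])
# combined = xr.concat(dsets, dim='x')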
@@ -29,7 +29,7 @@

from . import (
    TestCase, assert_allclose, assert_array_equal, assert_equal,
    assert_identical, flaky, has_netCDF4, has_scipy, network, raises_regex,
    assert_identical, has_dask, has_netCDF4, has_scipy, network, raises_regex,
    requires_dask, requires_h5netcdf, requires_netCDF4, requires_pathlib,
    requires_pydap, requires_pynio, requires_rasterio, requires_scipy,
    requires_scipy_or_netCDF4, requires_zarr)
@@ -1658,88 +1658,75 @@ class H5NetCDFDataTestAutocloseTrue(H5NetCDFDataTest):
    autoclose = True


class OpenMFDatasetManyFilesTest(TestCase):
    def validate_open_mfdataset_autoclose(self, engine, nfiles=10):
        randdata = np.random.randn(nfiles)
        original = Dataset({'foo': ('x', randdata)})
        # test standard open_mfdataset approach with too many files
        with create_tmp_files(nfiles) as tmpfiles:
            for readengine in engine:
                writeengine = (readengine if readengine != 'pynio'
                               else 'netcdf4')
                # split into multiple sets of temp files
                for ii in original.x.values:
                    subds = original.isel(x=slice(ii, ii + 1))
                    subds.to_netcdf(tmpfiles[ii], engine=writeengine)

                # check that calculation on opened datasets works properly
                ds = open_mfdataset(tmpfiles, engine=readengine,
                                    autoclose=True)
                self.assertAllClose(ds.x.sum().values,
                                    (nfiles * (nfiles - 1)) / 2)
                self.assertAllClose(ds.foo.sum().values, np.sum(randdata))
                self.assertAllClose(ds.sum().foo.values, np.sum(randdata))
                ds.close()

    def validate_open_mfdataset_large_num_files(self, engine):
        self.validate_open_mfdataset_autoclose(engine, nfiles=2000)
@pytest.fixture(params=['scipy', 'netcdf4', 'h5netcdf', 'pynio'])
def readengine(request):
    return request.param
Can anyone guess as to why the pynio tests are not being properly skipped using this fixture configuration? @mrocklin - I think you suggested this approach, which I like, but something weird is happening.

I should note, this is working on Python 2.7 and 3.4. Just not for 3.5 or 3.6.

    @requires_dask
    @requires_netCDF4
    def test_1_autoclose_netcdf4(self):
        self.validate_open_mfdataset_autoclose(engine=['netcdf4'])

    @requires_dask
    @requires_scipy
    def test_2_autoclose_scipy(self):
        self.validate_open_mfdataset_autoclose(engine=['scipy'])
@pytest.fixture(params=[1, 10, 500])
def nfiles(request):
    return request.param

    @requires_dask
    @requires_pynio
    def test_3_autoclose_pynio(self):
        self.validate_open_mfdataset_autoclose(engine=['pynio'])

    # use of autoclose=True with h5netcdf broken because of
    # probable h5netcdf error
    @requires_dask
    @requires_h5netcdf
    @pytest.mark.xfail
    def test_4_autoclose_h5netcdf(self):
        self.validate_open_mfdataset_autoclose(engine=['h5netcdf'])
@pytest.fixture(params=[True, False])
def autoclose(request):
    return request.param

    # These tests below are marked as flaky (and skipped by default) because
    # they fail sometimes on Travis-CI, for no clear reason.

    @requires_dask
    @requires_netCDF4
    @flaky
    @pytest.mark.slow
    def test_1_open_large_num_files_netcdf4(self):
        self.validate_open_mfdataset_large_num_files(engine=['netcdf4'])
@pytest.fixture(params=[True, False])
def parallel(request):
    return request.param

    @requires_dask
    @requires_scipy
    @flaky
    @pytest.mark.slow
    def test_2_open_large_num_files_scipy(self):
        self.validate_open_mfdataset_large_num_files(engine=['scipy'])

    @requires_dask
    @requires_pynio
    @flaky
    @pytest.mark.slow
    def test_3_open_large_num_files_pynio(self):
        self.validate_open_mfdataset_large_num_files(engine=['pynio'])

    # use of autoclose=True with h5netcdf broken because of
    # probable h5netcdf error
    @requires_dask
    @requires_h5netcdf
    @flaky
    @pytest.mark.xfail
    @pytest.mark.slow
    def test_4_open_large_num_files_h5netcdf(self):
        self.validate_open_mfdataset_large_num_files(engine=['h5netcdf'])
@pytest.fixture(params=[None, 5])
def chunks(request):
    return request.param


def skip_if_not_engine(engine):
    if engine == 'netcdf4':
        pytest.importorskip('netCDF4')
    elif engine == 'pynio':
        pytest.importorskip('Nio')
    else:
        pytest.importorskip(engine)
@maxim-lian - do you know a better way to skip parameterized tests based on some condition?

You might include the engine as a fixture. This is how we do it with Parquet in dask.dataframe:

@pytest.fixture(params=[pytest.mark.skipif(not fastparquet, 'fastparquet',
                                           reason='fastparquet not found'),
                        pytest.mark.skipif(not pq, 'pyarrow',
                                           reason='pyarrow not found')])
def engine(request):
    return request.param
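Adapting that suggestion to this test module might look roughly like the sketch below (editorial illustration, not PR code). It uses the pytest.param spelling for per-parameter skip marks, which needs a reasonably recent pytest, and it assumes the has_scipy/has_netCDF4 flags imported above:

# sketch of a readengine fixture with per-parameter skip marks
@pytest.fixture(params=[
    pytest.param('scipy', marks=pytest.mark.skipif(
        not has_scipy, reason='scipy not found')),
    pytest.param('netcdf4', marks=pytest.mark.skipif(
        not has_netCDF4, reason='netCDF4 not found')),
])
def readengine(request):
    return request.param

With the marks attached to the parameters, unavailable engines show up as skips in the report without needing an importorskip call inside the test body.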


def test_open_mfdataset_manyfiles(readengine, nfiles, autoclose, parallel,
                                  chunks):

    # skip certain combinations
    skip_if_not_engine(readengine)

    if not has_dask and parallel:
        pytest.skip('parallel requires dask')

    if readengine == 'h5netcdf' and autoclose:
        pytest.skip('h5netcdf does not support autoclose yet')

    if ON_WINDOWS:
        pytest.skip('Skipping on Windows')

    randdata = np.random.randn(nfiles)
    original = Dataset({'foo': ('x', randdata)})
    # test standard open_mfdataset approach with too many files
    with create_tmp_files(nfiles) as tmpfiles:
        writeengine = (readengine if readengine != 'pynio'
                       else 'netcdf4')
        # split into multiple sets of temp files
        for ii in original.x.values:
            subds = original.isel(x=slice(ii, ii + 1))
            subds.to_netcdf(tmpfiles[ii], engine=writeengine)

        # check that calculation on opened datasets works properly
        actual = open_mfdataset(tmpfiles, engine=readengine,
                                autoclose=autoclose,
                                chunks=chunks)
Don't you want to have a test that explicitly calls

nice catch :)
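One way to address that comment about explicitly exercising parallel (an editorial sketch, not what this revision of the diff does) would be to thread the parallel fixture through the call so both code paths run:

        # pass the fixture through so both parallel=True and parallel=False
        # branches of open_mfdataset are exercised
        actual = open_mfdataset(tmpfiles, engine=readengine,
                                parallel=parallel,
                                autoclose=autoclose,
                                chunks=chunks)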
        # check that using open_mfdataset returns dask arrays for variables
        assert isinstance(actual['foo'].data, dask_array_type)

        assert_identical(original, actual)


@requires_scipy_or_netCDF4
This item ended up duplicated when you merged in master.