
Add pathlib.Path support to open_(mf)dataset #1514


Merged
24 commits merged on Sep 1, 2017

Changes from all commits

Commits (24)
cb55c45
Add pathlib support
willirath Aug 21, 2017
f9922d6
Loop over tmpfile functions
willirath Aug 21, 2017
02023ed
Added show_commit_url to asv.conf (#1515)
TomAugspurger Aug 23, 2017
4276bb8
Small documentation fixes (#1516)
leezu Aug 25, 2017
812a483
Condense pathlib handling for open_mf_dataset
willirath Aug 25, 2017
47be4b7
Add and test pathlib support for backends
willirath Aug 25, 2017
aac0760
Add pathlib2 for python < 3
willirath Aug 25, 2017
3ca8c9e
Use pathlib backport if available.
willirath Aug 25, 2017
aae32a8
Use pathlib w DataArray.to_netcdf
willirath Aug 25, 2017
2cc69f4
Handle case of completely missing pathlib
willirath Aug 25, 2017
3033433
Remove pathlib requirement
willirath Aug 25, 2017
c8722db
Drop pathlib from minimal test env
willirath Aug 25, 2017
aeed776
Add what's-new entry on pathlib support
willirath Aug 25, 2017
137dff2
Prefer stdlib pathlib
willirath Aug 25, 2017
b55b013
Suppress ImportError's for pathlib
willirath Aug 25, 2017
422615f
Acutally get suppress function
willirath Aug 25, 2017
8c9ee31
Add decorator for tests requiring pathlib(2)
willirath Aug 26, 2017
f3dbf4b
Move path_type to central submodule
willirath Aug 26, 2017
efdc883
Remove unnecessary parens
willirath Aug 26, 2017
999d21d
Revert "Added show_commit_url to asv.conf (#1515)"
willirath Aug 26, 2017
04216f1
Revert "Small documentation fixes (#1516)"
willirath Aug 26, 2017
ce156a8
Fix typo in docstring and fallback-module name
willirath Aug 26, 2017
b22a389
Tweak what's new for pathlib support
shoyer Aug 31, 2017
791ba5b
Merge branch 'master' into 799-add-pathlib-support-2
shoyer Sep 1, 2017
1 change: 1 addition & 0 deletions ci/requirements-py27-cdat+pynio.yml
@@ -13,6 +13,7 @@ dependencies:
- netcdf4
- numpy
- pandas
- pathlib2
- pynio
- pytest
- scipy
1 change: 1 addition & 0 deletions ci/requirements-py27-windows.yml
@@ -9,6 +9,7 @@ dependencies:
- h5netcdf
- matplotlib
- netcdf4
- pathlib2
- pytest
- numpy
- pandas
22 changes: 22 additions & 0 deletions doc/whats-new.rst
@@ -35,9 +35,31 @@ Backward Incompatible Changes
Enhancements
~~~~~~~~~~~~

- Support for `pathlib.Path` objects added to
Inline review comment (Member): nit: we typically cite the issue number (e.g. :issue:`799`). Would be nice to include here.

Reply (Member): I just pushed a commit to add this.
:py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
:py:func:`~xarray.to_netcdf`, and :py:func:`~xarray.save_mfdataset`
(:issue:`799`):

.. ipython::
:verbatim:

In [2]: from pathlib import Path # In Python 2, use pathlib2!

In [3]: data_dir = Path("data/")

In [4]: one_file = data_dir / "dta_for_month_01.nc"

In [5]: xr.open_dataset(one_file)
Out[5]:
<xarray.Dataset>
[...]

By `Willi Rath <https://github.com/willirath>`_.

- More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
raster files are opened with :py:func:`~xarray.open_rasterio`.
By `Greg Brener <https://github.com/gbrener>`_

- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a
signed integer data type should be interpreted as unsigned bytes
(:issue:`1444`).
44 changes: 28 additions & 16 deletions xarray/backends/api.py
@@ -7,14 +7,15 @@
from io import BytesIO
from numbers import Number


import numpy as np

from .. import backends, conventions
from .common import ArrayWriter, GLOBAL_LOCK
from ..core import indexing
from ..core.combine import auto_combine
from ..core.utils import close_on_error, is_remote_uri
from ..core.pycompat import basestring
from ..core.pycompat import basestring, path_type

DATAARRAY_NAME = '__xarray_dataarray_name__'
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'
@@ -139,12 +140,12 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,

Parameters
----------
filename_or_obj : str, file or xarray.backends.*DataStore
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
and opened with python-netCDF4, unless the filename ends with .gz, in
which case the file is gunzipped and opened with scipy.io.netcdf (only
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
(only netCDF3 supported).
filename_or_obj : str, Path, file or xarray.backends.*DataStore
Strings and Path objects are interpreted as a path to a netCDF file
or an OpenDAP URL and opened with python-netCDF4, unless the filename
ends with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
with scipy.io.netcdf (only netCDF3 supported).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
@@ -253,6 +254,9 @@ def maybe_decode_store(store, lock=False):

return ds2

if isinstance(filename_or_obj, path_type):
filename_or_obj = str(filename_or_obj)

if isinstance(filename_or_obj, backends.AbstractDataStore):
store = filename_or_obj
elif isinstance(filename_or_obj, basestring):
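The two-line conversion above is the entire mechanism: a Path is reduced to its string form before the existing string-based dispatch runs. A minimal sketch of what this enables (the file name is hypothetical):

    from pathlib import Path  # on Python 2, pathlib2 provides the same class
    import xarray as xr

    p = Path('data') / 'example.nc'   # hypothetical file
    ds = xr.open_dataset(p)           # same as xr.open_dataset(str(p))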
@@ -318,12 +322,12 @@ def open_dataarray(*args, **kwargs):

Parameters
----------
filename_or_obj : str, file or xarray.backends.*DataStore
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
and opened with python-netCDF4, unless the filename ends with .gz, in
which case the file is gunzipped and opened with scipy.io.netcdf (only
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
(only netCDF3 supported).
filename_or_obj : str, Path, file or xarray.backends.*DataStore
Strings and Paths are interpreted as a path to a netCDF file or an
OpenDAP URL and opened with python-netCDF4, unless the filename ends
with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
with scipy.io.netcdf (only netCDF3 supported).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
@@ -438,7 +442,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
----------
paths : str or sequence
Either a string glob in the form "path/to/my/files/*.nc" or an explicit
list of files to open.
list of files to open. Paths can be given as strings or as pathlib
Paths.
chunks : int or dict, optional
Dictionary with keys given by dimension names and values given by chunk
sizes. In general, these should divide the dimensions of each dataset.
@@ -497,6 +502,9 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
"""
if isinstance(paths, basestring):
paths = sorted(glob(paths))
else:
paths = [str(p) if isinstance(p, path_type) else p for p in paths]
Inline review comment (Member): You may have already discussed this with @shoyer but can you remind me why we're not sorting in the same way we do for the glob path above? I guess we're assuming all the paths are expanded already?

Reply (Member): We sort after glob() since the iteration order is arbitrary. But we don't sort in general, since the order of the provided filenames might be intentional.

Unfortunately, there isn't any way to detect a generator created by pathlib's glob() method, since it's just a Python generator.

if not paths:
raise IOError('no files to open')
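To make the sorting behavior discussed above concrete, a short sketch (directory and files hypothetical): a string glob is expanded and sorted by open_mfdataset itself, while an explicit sequence, including one built from pathlib's Path.glob(), is used in the order given.

    from pathlib import Path
    import xarray as xr

    # String glob: expanded with glob.glob() and sorted internally.
    ds = xr.open_mfdataset('data/*.nc')

    # Explicit paths: Path.glob() yields files in arbitrary order and is
    # not re-sorted here, so sort explicitly if concatenation order matters.
    paths = sorted(Path('data').glob('*.nc'))
    ds = xr.open_mfdataset(paths)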

@@ -533,6 +541,8 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,

The ``writer`` argument is only for the private use of save_mfdataset.
"""
if isinstance(path_or_file, path_type):
path_or_file = str(path_or_file)
if encoding is None:
encoding = {}
if path_or_file is None:
@@ -597,12 +607,14 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
----------
datasets : list of xarray.Dataset
List of datasets to save.
paths : list of str
paths : list of str or list of Paths
List of paths to which to save each corresponding dataset.
mode : {'w', 'a'}, optional
Write ('w') or append ('a') mode. If mode='w', any existing file at
these locations will be overwritten.
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
'NETCDF3_CLASSIC'}, optional

File format for the resulting netCDF file:

* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
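The write side uses the same str() normalization, so Path objects work for output as well. A self-contained sketch (output names hypothetical):

    from pathlib import Path
    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'foo': ('x', np.random.randn(10))})
    out_dir = Path('.')  # hypothetical output directory

    ds.to_netcdf(out_dir / 'all.nc')
    xr.save_mfdataset([ds.isel(x=slice(5)), ds.isel(x=slice(5, 10))],
                      [out_dir / 'part1.nc', out_dir / 'part2.nc'])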
8 changes: 5 additions & 3 deletions xarray/core/dataarray.py
@@ -1286,15 +1286,16 @@ def to_netcdf(self, *args, **kwargs):

Parameters
----------
path : str, optional
path : str or Path, optional
Path to which to save this dataset. If no path is provided, this
function returns the resulting netCDF file as a bytes object; in
this case, we need to use scipy.io.netcdf, which does not support
netCDF version 4 (the default format becomes NETCDF3_64BIT).
mode : {'w', 'a'}, optional
Write ('w') or append ('a') mode. If mode='w', any existing file at
this location will be overwritten.
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
'NETCDF3_CLASSIC'}, optional
File format for the resulting netCDF file:

* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
@@ -1324,7 +1325,8 @@ def to_netcdf(self, *args, **kwargs):
encoding : dict, optional
Nested dictionary with variable names as keys and dictionaries of
variable specific encodings as values, e.g.,
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
'zlib': True}, ...}``

Notes
-----
5 changes: 3 additions & 2 deletions xarray/core/dataset.py
@@ -924,7 +924,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,

Parameters
----------
path : str or file-like object, optional
path : str, Path or file-like object, optional
Path to which to save this dataset. File-like objects are only
supported by the scipy engine. If no path is provided, this
function returns the resulting netCDF file as bytes; in this case,
@@ -963,7 +963,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
encoding : dict, optional
Nested dictionary with variable names as keys and dictionaries of
variable specific encodings as values, e.g.,
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
'zlib': True}, ...}``
unlimited_dims : sequence of str, optional
Dimension(s) that should be serialized as unlimited dimensions.
By default, no dimensions are treated as unlimited dimensions.
12 changes: 11 additions & 1 deletion xarray/core/pycompat.py
@@ -59,6 +59,16 @@ def itervalues(d):
except ImportError: # pragma: no cover
dask_array_type = ()

try:
try:
from pathlib import Path
except ImportError as e:
from pathlib2 import Path
path_type = (Path, )
except ImportError as e:
path_type = ()


try:
from contextlib import suppress
except ImportError:
@@ -188,7 +198,7 @@ def __exit__(self, *exc_details):
# We manipulate the exception state so it behaves as though
# we were actually nesting multiple with statements
frame_exc = sys.exc_info()[1]

def _fix_exception_context(new_exc, old_exc):
# Context may not be correct, so find the end of the chain
while 1:
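A note on the nested try/except above: isinstance(obj, ()) is always False, so when neither pathlib nor pathlib2 is importable, every Path check quietly fails and plain-string handling proceeds unchanged. A sketch of the resulting idiom (the normalize helper is illustrative, not part of this PR):

    from xarray.core.pycompat import path_type

    def normalize(obj):
        # Path-like objects become plain strings; everything else is
        # passed through. With path_type == (), isinstance() is always
        # False, so this is a safe no-op when pathlib is not installed.
        if isinstance(obj, path_type):
            return str(obj)
        return obj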
14 changes: 14 additions & 0 deletions xarray/tests/__init__.py
@@ -83,6 +83,17 @@
except ImportError:
has_rasterio = False

try:
import pathlib
has_pathlib = True
except ImportError:
try:
import pathlib2
has_pathlib = True
except ImportError:
has_pathlib = False


# slightly simpler construction than the full functions.
# Generally `pytest.importorskip('package')` inline is even easier
requires_matplotlib = pytest.mark.skipif(
@@ -105,6 +116,9 @@
not has_bottleneck, reason='requires bottleneck')
requires_rasterio = pytest.mark.skipif(
not has_rasterio, reason='requires rasterio')
requires_pathlib = pytest.mark.skipif(
not has_pathlib, reason='requires pathlib / pathlib2'
)
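As the comment above notes, pytest.importorskip is the inline alternative to module-level markers like requires_pathlib; a sketch (the test name is hypothetical):

    import pytest

    def test_needs_pathlib_inline():
        # Skips just this test at run time if the module is unavailable.
        pathlib = pytest.importorskip('pathlib')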


try:
57 changes: 53 additions & 4 deletions xarray/tests/test_backends.py
@@ -26,8 +26,9 @@

from . import (TestCase, requires_scipy, requires_netCDF4, requires_pydap,
requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf,
requires_pynio, has_netCDF4, has_scipy, assert_allclose,
flaky, network, requires_rasterio, assert_identical)
requires_pynio, requires_pathlib, has_netCDF4, has_scipy,
assert_allclose, flaky, network, requires_rasterio,
assert_identical)
from .test_dataset import create_test_data

try:
Expand All @@ -40,6 +41,14 @@
except ImportError:
pass

try:
from pathlib import Path
except ImportError:
try:
from pathlib2 import Path
except ImportError:
pass


ON_WINDOWS = sys.platform == 'win32'

@@ -302,7 +311,8 @@ def test_roundtrip_timedelta_data(self):
self.assertDatasetIdentical(expected, actual)

def test_roundtrip_float64_data(self):
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi], dtype='float64'))})
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi],
dtype='float64'))})
with self.roundtrip(expected) as actual:
self.assertDatasetIdentical(expected, actual)

@@ -738,7 +748,8 @@ def test_mask_and_scale(self):
v.scale_factor = 0.1
v[:] = np.array([-1, -1, 0, 1, 2])

# first make sure netCDF4 reads the masked and scaled data correctly
# first make sure netCDF4 reads the masked and scaled data
# correctly
with nc4.Dataset(tmp_file, mode='r') as nc:
expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
mask=[True, True, False, False, False])
@@ -1305,6 +1316,19 @@ def test_open_mfdataset(self):
with self.assertRaisesRegexp(IOError, 'no files to open'):
open_mfdataset('foo-bar-baz-*.nc', autoclose=self.autoclose)

@requires_pathlib
def test_open_mfdataset_pathlib(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
tmp1 = Path(tmp1)
tmp2 = Path(tmp2)
original.isel(x=slice(5)).to_netcdf(tmp1)
original.isel(x=slice(5, 10)).to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2],
autoclose=self.autoclose) as actual:
self.assertDatasetAllClose(original, actual)

def test_attrs_mfdataset(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp1:
@@ -1355,6 +1379,20 @@ def test_save_mfdataset_invalid(self):
with self.assertRaisesRegexp(ValueError, 'same length'):
save_mfdataset([ds, ds], ['only one path'])

@requires_pathlib
def test_save_mfdataset_pathlib_roundtrip(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
datasets = [original.isel(x=slice(5)),
original.isel(x=slice(5, 10))]
with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
tmp1 = Path(tmp1)
tmp2 = Path(tmp2)
save_mfdataset(datasets, [tmp1, tmp2])
with open_mfdataset([tmp1, tmp2],
autoclose=self.autoclose) as actual:
self.assertDatasetIdentical(actual, original)

def test_open_and_do_math(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp:
@@ -1946,3 +1984,14 @@ def test_open_dataarray_options(self):
expected = data.drop('y')
with open_dataarray(tmp, drop_variables=['y']) as loaded:
self.assertDataArrayIdentical(expected, loaded)

@requires_pathlib
def test_dataarray_to_netcdf_no_name_pathlib(self):
original_da = DataArray(np.arange(12).reshape((3, 4)))

with create_tmp_file() as tmp:
tmp = Path(tmp)
original_da.to_netcdf(tmp)

with open_dataarray(tmp) as loaded_da:
self.assertDataArrayIdentical(original_da, loaded_da)