diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 3e6d00a8059..8ace01daed5 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -403,6 +403,7 @@ def open_dataset(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    keep_variables: str | Iterable[str] | None = None,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -494,6 +495,10 @@ def open_dataset(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    keep_variables: str or iterable of str, optional
+        A variable or list of variables to load from the dataset; all other
+        variables (coordinates included) are skipped. Useful when only a
+        subset of a file is needed. Default is to load all variables.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -572,6 +577,7 @@
     backend_ds = backend.open_dataset(
         filename_or_obj,
         drop_variables=drop_variables,
+        keep_variables=keep_variables,
         **decoders,
         **kwargs,
     )
@@ -586,6 +592,7 @@
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        keep_variables=keep_variables,
         **decoders,
         **kwargs,
     )
@@ -606,6 +613,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    keep_variables: str | Iterable[str] | None = None,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -699,6 +707,10 @@ def open_dataarray(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    keep_variables: str or iterable of str, optional
+        A variable or list of variables to load from the dataset; all other
+        variables (coordinates included) are skipped. Useful when only a
+        subset of a file is needed. Default is to load all variables.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -756,6 +768,7 @@
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
+        keep_variables=keep_variables,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
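For reference, a minimal usage sketch of the new keyword against ``open_dataset`` (the file name and variable names here are hypothetical). Note that, as implemented in ``decode_cf_variables`` below, coordinate variables are filtered like any other variable, so they must be listed explicitly:

    import xarray as xr

    # Only "temperature" and its "time" coordinate are parsed; every other
    # variable in the file is skipped rather than loaded and decoded.
    ds = xr.open_dataset(
        "observations.nc",  # hypothetical file
        keep_variables=["temperature", "time"],
    )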
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 5b8f9a6840f..c681d699588 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -452,8 +452,8 @@ class BackendEntrypoint:
     - ``open_dataset`` method: it shall implement reading from file, variables
       decoding and it returns an instance of :py:class:`~xarray.Dataset`.
-      It shall take in input at least ``filename_or_obj`` argument and
-      ``drop_variables`` keyword argument.
+      It shall take in input at least the ``filename_or_obj`` argument and the
+      ``drop_variables`` and ``keep_variables`` keyword arguments.
       For more details see :ref:`RST open_dataset`.
     - ``guess_can_open`` method: it shall return ``True`` if the backend is able
       to open ``filename_or_obj``, ``False`` otherwise. The implementation of this
       method is not mandatory.
@@ -490,6 +490,7 @@ def open_dataset(
         filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
         *,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         **kwargs: Any,
     ) -> Dataset:
         """
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index a68a44b5f6f..3358f840a68 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -394,6 +394,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
         format=None,
@@ -427,6 +428,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             concat_characters=concat_characters,
             decode_coords=decode_coords,
             drop_variables=drop_variables,
+            keep_variables=keep_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index f21f15bf795..dcc20601774 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -588,6 +588,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
         group=None,
@@ -621,6 +622,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             concat_characters=concat_characters,
             decode_coords=decode_coords,
             drop_variables=drop_variables,
+            keep_variables=keep_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py
index 9b5bcc82e6f..73f1b023833 100644
--- a/xarray/backends/pydap_.py
+++ b/xarray/backends/pydap_.py
@@ -179,6 +179,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
         application=None,
@@ -207,6 +208,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             concat_characters=concat_characters,
             decode_coords=decode_coords,
             drop_variables=drop_variables,
+            keep_variables=keep_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py
index 75e96ffdc0a..db2235fbb2f 100644
--- a/xarray/backends/pynio_.py
+++ b/xarray/backends/pynio_.py
@@ -134,6 +134,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
         mode="r",
@@ -155,6 +156,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             concat_characters=concat_characters,
             decode_coords=decode_coords,
             drop_variables=drop_variables,
+            keep_variables=keep_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py
index 1ecc70cf376..ff4b0667d28 100644
--- a/xarray/backends/scipy_.py
+++ b/xarray/backends/scipy_.py
@@ -297,6 +297,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
         mode="r",
@@ -319,6 +320,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             concat_characters=concat_characters,
             decode_coords=decode_coords,
             drop_variables=drop_variables,
+            keep_variables=keep_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
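With the ``BackendEntrypoint`` contract change above, third-party backends should accept the new keyword as well. A toy sketch of a conforming entrypoint, modeled on the pass-through backend used in the tests (the class name and pass-through behaviour are illustrative, not required by the interface):

    import xarray as xr
    from xarray.backends import BackendEntrypoint


    class InMemoryBackendEntrypoint(BackendEntrypoint):
        """Toy backend whose "file" is an in-memory xr.Dataset."""

        def open_dataset(self, filename_or_obj, *, drop_variables=None, keep_variables=None):
            ds = filename_or_obj
            if drop_variables is not None:
                ds = ds.drop_vars(drop_variables)
            if keep_variables is not None:
                if isinstance(keep_variables, str):
                    keep_variables = [keep_variables]
                # Dataset[list] subsets to just these variables; unlike the
                # decode_cf_variables filter below, it also pulls in their
                # coordinates automatically.
                ds = ds[list(keep_variables)]
            return ds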
diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index a507ee37470..db566b038e2 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -35,6 +35,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
     ) -> Dataset:
@@ -51,6 +52,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             concat_characters=concat_characters,
             decode_coords=decode_coords,
             drop_variables=drop_variables,
+            keep_variables=keep_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 6632e40cf6f..13a76c65ca7 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -775,6 +775,7 @@ def open_zarr(
     concat_characters=True,
     decode_coords=True,
     drop_variables=None,
+    keep_variables=None,
    consolidated=None,
     overwrite_encoded_chunks=False,
     chunk_store=None,
@@ -836,6 +837,10 @@ def open_zarr(
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    keep_variables : str or iterable of str, optional
+        A variable or list of variables to load from the dataset; all other
+        variables (coordinates included) are skipped. Useful when only a
+        subset of a store is needed. Default is to load all variables.
     consolidated : bool, optional
         Whether to open the store using zarr's consolidated metadata
         capability. Only works for stores that have already been consolidated.
@@ -933,6 +938,7 @@
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
+        keep_variables=keep_variables,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
@@ -977,6 +983,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        keep_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
         group=None,
@@ -1011,6 +1018,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
             concat_characters=concat_characters,
             decode_coords=decode_coords,
             drop_variables=drop_variables,
+            keep_variables=keep_variables,
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
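The same keyword flows through ``open_zarr`` (store path and array names here are hypothetical):

    import xarray as xr

    # Only the "precip" array and its "time" coordinate are read; the
    # remaining arrays in the store are never touched.
    ds = xr.open_zarr("path/to/store.zarr", keep_variables=["precip", "time"])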
diff --git a/xarray/conventions.py b/xarray/conventions.py
index cf207f0c37a..1bfaa8d8833 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -45,6 +45,7 @@
     T_Variables = Mapping[Any, Variable]
     T_Attrs = MutableMapping[Any, Any]
     T_DropVariables = Union[str, Iterable[Hashable], None]
+    T_KeepVariables = Union[str, Iterable[Hashable], None]
     T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore]
 
 
@@ -380,6 +381,7 @@ def decode_cf_variables(
     decode_times: bool = True,
     decode_coords: bool | Literal["coordinates", "all"] = True,
     drop_variables: T_DropVariables = None,
+    keep_variables: T_KeepVariables = None,
     use_cftime: bool | None = None,
     decode_timedelta: bool | None = None,
 ) -> tuple[T_Variables, T_Attrs, set[Hashable]]:
@@ -410,13 +412,21 @@ def stackable(dim: Hashable) -> bool:
         drop_variables = []
     drop_variables = set(drop_variables)
 
+    # None means "keep everything"; guard so set(None) is never evaluated.
+    if keep_variables is not None:
+        if isinstance(keep_variables, str):
+            keep_variables = [keep_variables]
+        keep_variables = set(keep_variables)
+
     # Time bounds coordinates might miss the decoding attributes
     if decode_times:
         _update_bounds_attributes(variables)
 
     new_vars = {}
     for k, v in variables.items():
-        if k in drop_variables:
+        if k in drop_variables or (
+            keep_variables is not None and k not in keep_variables
+        ):
             continue
         stack_char_dim = (
             concat_characters
@@ -496,6 +506,7 @@ def decode_cf(
     decode_times: bool = True,
     decode_coords: bool | Literal["coordinates", "all"] = True,
     drop_variables: T_DropVariables = None,
+    keep_variables: T_KeepVariables = None,
     use_cftime: bool | None = None,
     decode_timedelta: bool | None = None,
 ) -> Dataset:
@@ -527,6 +538,10 @@
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    keep_variables : str or iterable of str, optional
+        A variable or list of variables to keep while decoding; all other
+        variables (coordinates included) are dropped. Default is to keep
+        all variables.
     use_cftime : bool, optional
         Only relevant if encoded dates come from a standard calendar
         (e.g. "gregorian", "proleptic_gregorian", "standard", or not
@@ -574,6 +589,7 @@ def decode_cf(
         decode_times,
         decode_coords,
         drop_variables=drop_variables,
+        keep_variables=keep_variables,
         use_cftime=use_cftime,
         decode_timedelta=decode_timedelta,
     )
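To make the filtering semantics concrete, a small sketch against ``decode_cf`` (variable names are arbitrary); ``"t"`` is a coordinate but still has to be listed to survive:

    import xarray as xr
    from xarray import conventions

    raw = xr.Dataset(
        {
            "t": ("t", [0, 1, 2], {"units": "days since 2000-01-01"}),
            "foo": ("t", [1.0, 2.0, 3.0], {"units": "bar"}),
            "baz": ("t", [9, 9, 9]),
        }
    )

    decoded = conventions.decode_cf(raw, keep_variables=["t", "foo"])
    print(list(decoded.variables))  # ['t', 'foo'] -- "baz" was never decoded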
"gregorian", "proleptic_gregorian", "standard", or not @@ -574,6 +587,7 @@ def decode_cf( decode_times, decode_coords, drop_variables=drop_variables, + keep_variables=keep_variables, use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py index befc4cbaf04..02cd9c1427f 100644 --- a/xarray/tests/test_backends_api.py +++ b/xarray/tests/test_backends_api.py @@ -69,7 +69,7 @@ def open_dataset( class PassThroughBackendEntrypoint(xr.backends.BackendEntrypoint): """Access an object passed to the `open_dataset` method.""" - def open_dataset(self, dataset, *, drop_variables=None): + def open_dataset(self, dataset, *, drop_variables=None, keep_variables=None): """Return the first argument.""" return dataset diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index d6d1303a696..7e135b86157 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -327,6 +327,43 @@ def test_decode_cf_with_drop_variables(self) -> None: assert_identical(expected, actual) assert_identical(expected, actual2) + def test_decode_cf_with_keep_variables(self) -> None: + original = Dataset( + { + "t": ("t", [0, 1, 2], {"units": "days since 2000-01-01"}), + "x": ("x", [9, 8, 7], {"units": "km"}), + "foo": ( + ("t", "x"), + [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + {"units": "bar"}, + ), + "y": ("t", [5, 10, -999], {"_FillValue": -999}), + } + ) + expected = Dataset( + { + "t": pd.date_range("2000-01-01", periods=3), + "foo": ( + ("t", "x"), + [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + {"units": "bar"}, + ), + "y": ("t", [5, 10, np.nan]), + } + ) + expected2 = Dataset( + { + "t": pd.date_range("2000-01-01", periods=3), + } + ) + expected3 = Dataset() + actual = conventions.decode_cf(original, keep_variables=("t", "foo", "y")) + actual2 = conventions.decode_cf(original, keep_variables="t") + actual3 = conventions.decode_cf(original, keep_variables=[]) + assert_identical(expected, actual) + assert_identical(expected2, actual2) + assert_identical(expected3, actual3) + @pytest.mark.filterwarnings("ignore:Ambiguous reference date string") def test_invalid_time_units_raises_eagerly(self) -> None: ds = Dataset({"time": ("time", [0, 1], {"units": "foobar since 123"})})