diff --git a/README.md b/README.md index b6ca88b..8e1c2af 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ It's a good idea to use `conda` to handle installing rasterio on Windows. There' * Transfer the STAC metadata into [xarray coordinates](http://xarray.pydata.org/en/stable/data-structures.html#coordinates) for easy indexing, filtering, and provenance of metadata. * Efficiently generate a Dask graph for loading the data in parallel. * Mediate between Dask's parallelism and GDAL's aversion to it, allowing for fast, multi-threaded reads when possible, and at least preventing segfaults when not. -* Mask nodata and rescale by dataset-level scales/offsets. +* Mask nodata and rescale by STAC item asset scales/offsets. * Display data in interactive maps in a notebook, computed on the fly by Dask. ## Limitations: diff --git a/stackstac/prepare.py b/stackstac/prepare.py index 2f2bcf7..d9f68a0 100644 --- a/stackstac/prepare.py +++ b/stackstac/prepare.py @@ -23,10 +23,11 @@ import xarray as xr from .raster_spec import IntFloat, Bbox, Resolutions, RasterSpec + from .stac_types import ItemSequence from . import accumulate_metadata, geom_utils -ASSET_TABLE_DT = np.dtype([("url", object), ("bounds", "float64", 4)]) +ASSET_TABLE_DT = np.dtype([("url", object), ("bounds", "float64", 4), ("scale_offset", "float64", 2)]) class Mimetype(NamedTuple): @@ -143,6 +144,22 @@ def prepare_items( asset_bbox = asset.get("proj:bbox", item_bbox) asset_shape = asset.get("proj:shape", item_shape) asset_transform = asset.get("proj:transform", item_transform) + raster_bands = asset.get('raster:bands') + + if raster_bands is not None: + if len(raster_bands) != 1: + raise ValueError( + f"raster:bands has {len(raster_bands)} elements for asset {asset_id!r}. " + "Multi-band rasters are not currently supported.\n" + "If you don't care about this asset, you can skip it by giving a list " + "of asset IDs you *do* want in `assets=`, and leaving this one out." + ) + asset_scale = raster_bands[0].get('scale', 1) + asset_offset = raster_bands[0].get('offset', 0) + else: + asset_scale = 1 + asset_offset = 0 + asset_affine = None # Auto-compute CRS @@ -322,7 +339,7 @@ def prepare_items( continue # Phew, we figured out all the spatial stuff! Now actually store the information we care about. - asset_table[item_i, asset_i] = (asset["href"], asset_bbox_proj) + asset_table[item_i, asset_i] = (asset["href"], asset_bbox_proj, (asset_scale, asset_offset)) # ^ NOTE: If `asset_bbox_proj` is None, NumPy automatically converts it to NaNs # At this point, everything has been set (or there was as error) diff --git a/stackstac/reader_protocol.py b/stackstac/reader_protocol.py index 81f7da4..4b38fae 100644 --- a/stackstac/reader_protocol.py +++ b/stackstac/reader_protocol.py @@ -34,7 +34,7 @@ def __init__( resampling: Resampling, dtype: np.dtype, fill_value: Union[int, float], - rescale: bool, + scale_offset: Tuple[Union[int, float], Union[int, float]], gdal_env: Optional[LayeredEnv], errors_as_nodata: Tuple[Exception, ...] = (), ) -> None: @@ -55,8 +55,6 @@ def __init__( fill_value: Fill nodata pixels in the output array with this value. If None, whatever nodata value is set in the asset will be used. - rescale: - Rescale the output array according to any scales and offsets set in the asset. gdal_env: A `~.LayeredEnv` of GDAL configuration options to use while opening and reading datasets. If None (default), `~.DEFAULT_GDAL_ENV` is used. diff --git a/stackstac/rio_reader.py b/stackstac/rio_reader.py index 521c95e..1a41532 100644 --- a/stackstac/rio_reader.py +++ b/stackstac/rio_reader.py @@ -32,7 +32,6 @@ def _curthread(): # /TODO - # Default GDAL configuration options DEFAULT_GDAL_ENV = LayeredEnv( always=dict( @@ -64,11 +63,12 @@ def _curthread(): # See `ThreadLocalRioDataset` for more. # https://github.com/pangeo-data/pangeo-example-notebooks/issues/21#issuecomment-432457955 # https://gdal.org/drivers/raster/vrt.html#multi-threading-issues + MULTITHREADED_DRIVER_ALLOWLIST = {"GTiff"} class ThreadsafeRioDataset(Protocol): - scale_offset: Tuple[float, float] + scale_offset: Tuple[Union[int, float], Union[int, float]] def read(self, window: Window, **kwargs) -> np.ndarray: ... @@ -280,7 +280,7 @@ class PickleState(TypedDict): resampling: Resampling dtype: np.dtype fill_value: Union[int, float] - rescale: bool + scale_offset: Tuple[Union[int, float], Union[int, float]] gdal_env: Optional[LayeredEnv] errors_as_nodata: Tuple[Exception, ...] @@ -303,7 +303,7 @@ def __init__( resampling: Resampling, dtype: np.dtype, fill_value: Union[int, float], - rescale: bool, + scale_offset: Tuple[Union[int, float], Union[int, float]], gdal_env: Optional[LayeredEnv] = None, errors_as_nodata: Tuple[Exception, ...] = (), ) -> None: @@ -311,8 +311,8 @@ def __init__( self.spec = spec self.resampling = resampling self.dtype = dtype - self.rescale = rescale self.fill_value = fill_value + self.scale_offset = scale_offset self.gdal_env = gdal_env or DEFAULT_GDAL_ENV self.errors_as_nodata = errors_as_nodata @@ -398,12 +398,12 @@ def read(self, window: Window, **kwargs) -> np.ndarray: raise RuntimeError(msg) from e - if self.rescale: - scale, offset = reader.scale_offset - if scale != 1: - result *= scale - if offset != 0: - result += offset + scale, offset = self.scale_offset + + if scale != 1: + result *= scale + if offset != 0: + result += offset result = np.ma.filled(result, fill_value=self.fill_value) assert np.issubdtype(result.dtype, self.dtype), ( @@ -436,7 +436,7 @@ def __getstate__( "resampling": self.resampling, "dtype": self.dtype, "fill_value": self.fill_value, - "rescale": self.rescale, + "scale_offset": self.scale_offset, "gdal_env": self.gdal_env, "errors_as_nodata": self.errors_as_nodata, } diff --git a/stackstac/stack.py b/stackstac/stack.py index 13f3368..983137f 100644 --- a/stackstac/stack.py +++ b/stackstac/stack.py @@ -202,7 +202,8 @@ def stack( be represented in a smaller data type (like ``uint16``), using a different ``fill_value`` (like 0) and managing it yourself could save a lot of memory. rescale: - Whether to rescale pixel values by the scale and offset set on the dataset. + Whether to rescale pixel values by the scale and offset present in the ``raster:bands`` metadata + for each asset. Default: True. Note that this could produce floating-point data when the original values are ints, so set ``dtype`` accordingly. You will NOT be warned if the cast to ``dtype`` is losing information! diff --git a/stackstac/to_dask.py b/stackstac/to_dask.py index 517f567..280196b 100644 --- a/stackstac/to_dask.py +++ b/stackstac/to_dask.py @@ -132,6 +132,10 @@ def asset_table_to_reader_and_window( if url: asset_bounds: Bbox = asset_entry["bounds"] asset_window = windows.from_bounds(*asset_bounds, spec.transform) + if rescale: + asset_scale_offset = asset_entry["scale_offset"] + else: + asset_scale_offset = (1, 0) entry: ReaderTableEntry = ( reader( @@ -140,7 +144,7 @@ def asset_table_to_reader_and_window( resampling=resampling, dtype=dtype, fill_value=fill_value, - rescale=rescale, + scale_offset=asset_scale_offset, gdal_env=gdal_env, errors_as_nodata=errors_as_nodata, ),