
Commit 7836a36

Scale offset from item asset (#202)
1 parent: acaa55a

6 files changed: +40 / -20 lines

README.md

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ It's a good idea to use `conda` to handle installing rasterio on Windows. There'
 * Transfer the STAC metadata into [xarray coordinates](http://xarray.pydata.org/en/stable/data-structures.html#coordinates) for easy indexing, filtering, and provenance of metadata.
 * Efficiently generate a Dask graph for loading the data in parallel.
 * Mediate between Dask's parallelism and GDAL's aversion to it, allowing for fast, multi-threaded reads when possible, and at least preventing segfaults when not.
-* Mask nodata and rescale by dataset-level scales/offsets.
+* Mask nodata and rescale by STAC item asset scales/offsets.
 * Display data in interactive maps in a notebook, computed on the fly by Dask.

 ## Limitations:
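
For context, the per-asset scale and offset now come from the STAC raster extension's `raster:bands` field rather than from GDAL's dataset-level metadata. A minimal sketch of what that looks like on an item asset (the href and all values are hypothetical, for illustration only):

```python
# Hypothetical STAC asset dict carrying scale/offset via the `raster:bands` field
# of the STAC raster extension; all values here are made up.
asset = {
    "href": "https://example.com/scene/B04.tif",
    "type": "image/tiff; application=geotiff; profile=cloud-optimized",
    "raster:bands": [
        {"data_type": "uint16", "nodata": 0, "scale": 0.0001, "offset": 0.0},
    ],
}

# The same lookups (and defaults) the new prepare.py code performs:
band = asset["raster:bands"][0]
scale = band.get("scale", 1)    # -> 0.0001
offset = band.get("offset", 0)  # -> 0.0
```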

stackstac/prepare.py

Lines changed: 19 additions & 2 deletions
@@ -23,10 +23,11 @@
 import xarray as xr

 from .raster_spec import IntFloat, Bbox, Resolutions, RasterSpec
+
 from .stac_types import ItemSequence
 from . import accumulate_metadata, geom_utils

-ASSET_TABLE_DT = np.dtype([("url", object), ("bounds", "float64", 4)])
+ASSET_TABLE_DT = np.dtype([("url", object), ("bounds", "float64", 4), ("scale_offset", "float64", 2)])


 class Mimetype(NamedTuple):
@@ -143,6 +144,22 @@ def prepare_items(
             asset_bbox = asset.get("proj:bbox", item_bbox)
             asset_shape = asset.get("proj:shape", item_shape)
             asset_transform = asset.get("proj:transform", item_transform)
+            raster_bands = asset.get('raster:bands')
+
+            if raster_bands is not None:
+                if len(raster_bands) != 1:
+                    raise ValueError(
+                        f"raster:bands has {len(raster_bands)} elements for asset {asset_id!r}. "
+                        "Multi-band rasters are not currently supported.\n"
+                        "If you don't care about this asset, you can skip it by giving a list "
+                        "of asset IDs you *do* want in `assets=`, and leaving this one out."
+                    )
+                asset_scale = raster_bands[0].get('scale', 1)
+                asset_offset = raster_bands[0].get('offset', 0)
+            else:
+                asset_scale = 1
+                asset_offset = 0
+
             asset_affine = None

             # Auto-compute CRS
@@ -322,7 +339,7 @@ def prepare_items(
                 continue

             # Phew, we figured out all the spatial stuff! Now actually store the information we care about.
-            asset_table[item_i, asset_i] = (asset["href"], asset_bbox_proj)
+            asset_table[item_i, asset_i] = (asset["href"], asset_bbox_proj, (asset_scale, asset_offset))
             # ^ NOTE: If `asset_bbox_proj` is None, NumPy automatically converts it to NaNs

     # At this point, everything has been set (or there was as error)
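
A quick sketch of how the widened `ASSET_TABLE_DT` carries the new field. The dtype is copied from the hunk above; the href, bounds, and scale/offset values are hypothetical:

```python
import numpy as np

# Dtype from the diff above: each asset row now also stores a (scale, offset) pair.
ASSET_TABLE_DT = np.dtype(
    [("url", object), ("bounds", "float64", 4), ("scale_offset", "float64", 2)]
)

# One item x one asset, indexed the same way prepare_items() fills it.
asset_table = np.zeros((1, 1), dtype=ASSET_TABLE_DT)
asset_table[0, 0] = (
    "https://example.com/scene/B04.tif",          # hypothetical href
    (399960.0, 4090200.0, 509760.0, 4200000.0),   # hypothetical projected bbox
    (0.0001, 0.0),                                 # (scale, offset); (1, 0) when raster:bands is absent
)

print(asset_table["scale_offset"][0, 0])  # -> [1.e-04 0.e+00]
```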

stackstac/reader_protocol.py

Lines changed: 1 addition & 3 deletions
@@ -34,7 +34,7 @@ def __init__(
         resampling: Resampling,
         dtype: np.dtype,
         fill_value: Union[int, float],
-        rescale: bool,
+        scale_offset: Tuple[Union[int, float], Union[int, float]],
         gdal_env: Optional[LayeredEnv],
         errors_as_nodata: Tuple[Exception, ...] = (),
     ) -> None:
@@ -55,8 +55,6 @@ def __init__(
         fill_value:
             Fill nodata pixels in the output array with this value.
             If None, whatever nodata value is set in the asset will be used.
-        rescale:
-            Rescale the output array according to any scales and offsets set in the asset.
         gdal_env:
             A `~.LayeredEnv` of GDAL configuration options to use while opening
             and reading datasets. If None (default), `~.DEFAULT_GDAL_ENV` is used.
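
To illustrate the signature change from the caller's side, here is a toy reader (not stackstac code) that follows the updated constructor contract: it receives the concrete `(scale, offset)` pair instead of a `rescale` flag and applies it when reading. The class name and its other parameters are made up for the sketch:

```python
from typing import Tuple, Union

import numpy as np


class ConstantReader:
    """Toy stand-in for a reader: serves a constant array, applying scale/offset."""

    def __init__(
        self,
        value: float,
        shape: Tuple[int, int],
        scale_offset: Tuple[Union[int, float], Union[int, float]],
    ) -> None:
        self.value = value
        self.shape = shape
        self.scale_offset = scale_offset

    def read(self, window=None) -> np.ndarray:
        # Apply the pair handed in by the caller; (1, 0) leaves values unchanged.
        scale, offset = self.scale_offset
        return np.full(self.shape, self.value, dtype="float64") * scale + offset


print(ConstantReader(5000, (2, 2), scale_offset=(0.0001, 0.0)).read())
# [[0.5 0.5]
#  [0.5 0.5]]
```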

stackstac/rio_reader.py

Lines changed: 12 additions & 12 deletions
@@ -32,7 +32,6 @@ def _curthread():

 # /TODO

-
 # Default GDAL configuration options
 DEFAULT_GDAL_ENV = LayeredEnv(
     always=dict(
@@ -64,11 +63,12 @@ def _curthread():
 # See `ThreadLocalRioDataset` for more.
 # https://github.com/pangeo-data/pangeo-example-notebooks/issues/21#issuecomment-432457955
 # https://gdal.org/drivers/raster/vrt.html#multi-threading-issues
+
 MULTITHREADED_DRIVER_ALLOWLIST = {"GTiff"}


 class ThreadsafeRioDataset(Protocol):
-    scale_offset: Tuple[float, float]
+    scale_offset: Tuple[Union[int, float], Union[int, float]]

     def read(self, window: Window, **kwargs) -> np.ndarray:
         ...
@@ -280,7 +280,7 @@ class PickleState(TypedDict):
     resampling: Resampling
     dtype: np.dtype
     fill_value: Union[int, float]
-    rescale: bool
+    scale_offset: Tuple[Union[int, float], Union[int, float]]
     gdal_env: Optional[LayeredEnv]
     errors_as_nodata: Tuple[Exception, ...]

@@ -303,16 +303,16 @@ def __init__(
         resampling: Resampling,
         dtype: np.dtype,
         fill_value: Union[int, float],
-        rescale: bool,
+        scale_offset: Tuple[Union[int, float], Union[int, float]],
         gdal_env: Optional[LayeredEnv] = None,
         errors_as_nodata: Tuple[Exception, ...] = (),
     ) -> None:
         self.url = url
         self.spec = spec
         self.resampling = resampling
         self.dtype = dtype
-        self.rescale = rescale
         self.fill_value = fill_value
+        self.scale_offset = scale_offset
         self.gdal_env = gdal_env or DEFAULT_GDAL_ENV
         self.errors_as_nodata = errors_as_nodata

@@ -398,12 +398,12 @@ def read(self, window: Window, **kwargs) -> np.ndarray:

             raise RuntimeError(msg) from e

-        if self.rescale:
-            scale, offset = reader.scale_offset
-            if scale != 1:
-                result *= scale
-            if offset != 0:
-                result += offset
+        scale, offset = self.scale_offset
+
+        if scale != 1:
+            result *= scale
+        if offset != 0:
+            result += offset

         result = np.ma.filled(result, fill_value=self.fill_value)
         assert np.issubdtype(result.dtype, self.dtype), (
@@ -436,7 +436,7 @@ def __getstate__(
             "resampling": self.resampling,
             "dtype": self.dtype,
             "fill_value": self.fill_value,
-            "rescale": self.rescale,
+            "scale_offset": self.scale_offset,
             "gdal_env": self.gdal_env,
             "errors_as_nodata": self.errors_as_nodata,
         }
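
The read path now applies whatever pair it was handed, relying on the `scale != 1` / `offset != 0` checks to make the identity pair free. A minimal standalone sketch of that arithmetic; the sample values are hypothetical, loosely modeled on Landsat Collection 2 surface-reflectance scaling:

```python
import numpy as np


def apply_scale_offset(result: np.ndarray, scale: float, offset: float) -> np.ndarray:
    """Apply the same scale-then-offset rescaling as the diff above
    (out-of-place here, for safety with integer inputs), skipping the no-op cases."""
    if scale != 1:
        result = result * scale
    if offset != 0:
        result = result + offset
    return result


raw = np.array([7273.0, 21700.0, 43636.0])       # hypothetical raw digital numbers
print(apply_scale_offset(raw, 0.0000275, -0.2))  # ~[0.0, 0.397, 1.0]
print(apply_scale_offset(raw, 1, 0))             # unchanged: the identity pair is a no-op
```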

stackstac/stack.py

Lines changed: 2 additions & 1 deletion
@@ -202,7 +202,8 @@ def stack(
         be represented in a smaller data type (like ``uint16``), using a different ``fill_value``
         (like 0) and managing it yourself could save a lot of memory.
     rescale:
-        Whether to rescale pixel values by the scale and offset set on the dataset.
+        Whether to rescale pixel values by the scale and offset present in the ``raster:bands`` metadata
+        for each asset.
         Default: True. Note that this could produce floating-point data when the
         original values are ints, so set ``dtype`` accordingly. You will NOT be warned
         if the cast to ``dtype`` is losing information!
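
A hedged usage sketch of the documented behavior: with `rescale=True` (the default), the returned values are already multiplied by the per-asset `raster:bands` scale and shifted by its offset, so an integer source generally needs a floating-point `dtype`. The `items` variable and the `"B04"` asset ID are assumptions, not part of this commit:

```python
import stackstac

# `items` is assumed to be a sequence of STAC items (e.g. from a pystac-client search)
# whose assets carry `raster:bands` scale/offset metadata.
scaled = stackstac.stack(
    items,
    assets=["B04"],   # hypothetical asset ID
    dtype="float64",  # leave room for scaled values; lossy casts are NOT warned about
    rescale=True,     # the default
)

raw = stackstac.stack(items, assets=["B04"], rescale=False)  # keep raw digital numbers
```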

stackstac/to_dask.py

Lines changed: 5 additions & 1 deletion
@@ -132,6 +132,10 @@ def asset_table_to_reader_and_window(
         if url:
             asset_bounds: Bbox = asset_entry["bounds"]
             asset_window = windows.from_bounds(*asset_bounds, spec.transform)
+            if rescale:
+                asset_scale_offset = asset_entry["scale_offset"]
+            else:
+                asset_scale_offset = (1, 0)

             entry: ReaderTableEntry = (
                 reader(
@@ -140,7 +144,7 @@
                     resampling=resampling,
                     dtype=dtype,
                     fill_value=fill_value,
-                    rescale=rescale,
+                    scale_offset=asset_scale_offset,
                     gdal_env=gdal_env,
                     errors_as_nodata=errors_as_nodata,
                 ),
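
So after this change, `rescale` only decides whether the pair stored in the asset table is forwarded to each reader or replaced by the identity pair `(1, 0)`. A tiny sketch of that branch; the helper name and entry values are hypothetical:

```python
from typing import Tuple, Union

Number = Union[int, float]


def pick_scale_offset(asset_entry: dict, rescale: bool) -> Tuple[Number, Number]:
    """Mirror the branch added above: identity pair when rescaling is disabled."""
    return tuple(asset_entry["scale_offset"]) if rescale else (1, 0)


entry = {"scale_offset": (0.0000275, -0.2)}     # hypothetical row from the asset table
print(pick_scale_offset(entry, rescale=True))   # -> (2.75e-05, -0.2)
print(pick_scale_offset(entry, rescale=False))  # -> (1, 0)
```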
