diff --git a/CHANGES.md b/CHANGES.md index a4cdee3..db5bb28 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,15 @@ +* Add `RDPS_CRIM` and `HRDPS_CRIM` implementations. +* Add `cf` extension adding CF Parameter metadata to (H)RDPS stac asset and items. +* Add `cf` and `file` helpers. +* Add `providers` and `contacts` extensions metdata to (H)RDPS stac collection. +* Fix deprecated access to `model_fields` in `BaseSTAC` data model class. +* Fix bug service type check in extensions' `get_assets` methods. +* Fix return type of `from_data` in `THREDDSCatalogDataModel`. +* Update RDPS and HRDPS tests. + ## [0.11.0](https://github.com/crim-ca/stac-populator/tree/0.11.0) (2025-11-17) * Add option to automatically update collection extents and summaries based on ingested items. diff --git a/README.md b/README.md index 34fce15..77df849 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,14 @@ Provided implementations of `STACpopulatorBase`: | Implementation | Description | |----------------------------------------------|-------------------------------------------------------------------------------------------------------------------------| +| [RDPS_CRIM][RDPS_CRIM] | Crawls a THREDDS Catalog for RDPS NCML-annotated NetCDF references to publish corresponding STAC Collection and Items. | +| [HRDPS_CRIM][HRDPS_CRIM] | Crawls a THREDDS Catalog for HRDPS NCML-annotated NetCDF references to publish corresponding STAC Collection and Items. | | [CMIP6_UofT][CMIP6_UofT] | Crawls a THREDDS Catalog for CMIP6 NCML-annotated NetCDF references to publish corresponding STAC Collection and Items. | | [DirectoryLoader][DirLoader] | Crawls a subdirectory hierarchy of pre-generated STAC Collections and Items to publish to a STAC API endpoint. | | [CORDEX-CMIP6_Ouranos][CORDEX-CMIP6_Ouranos] | Crawls a THREDDS Catalog for CORDEX-CMIP6 NetCDF references to publish corresponding STAC Collection and Items. | +[RDPS_CRIM]: STACpopulator/implementations/RDPS_CRIM/add_RDPS.py +[HRDPS_CRIM]: STACpopulator/implementations/HRDPS_CRIM/add_HRDPS.py [CMIP6_UofT]: STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py [DirLoader]: STACpopulator/implementations/DirectoryLoader/crawl_directory.py [CORDEX-CMIP6_Ouranos]: STACpopulator/implementations/CORDEX-CMIP6_Ouranos/add_CORDEX-CMIP6.py diff --git a/STACpopulator/extensions/base.py b/STACpopulator/extensions/base.py index 357ada2..cce94b5 100644 --- a/STACpopulator/extensions/base.py +++ b/STACpopulator/extensions/base.py @@ -64,6 +64,16 @@ class Helper: """Class to be subclassed by extension helpers.""" + @classmethod + @abstractmethod + def from_data( + cls, + data: dict[str, Any], + **kwargs, + ) -> "Helper": + """Create a Helper instance from raw data.""" + pass + class ExtensionHelper(BaseModel, Helper): """Base class for dataset properties going into the catalog. @@ -190,7 +200,8 @@ def create_uid(self) -> str: @model_validator(mode="after") def find_helpers(self) -> "BaseSTAC": """Populate the list of extensions.""" - for key, field in self.model_fields.items(): + # Access model fields from class. From obj will be removed in pydantic v3 + for key, field in type(self).model_fields.items(): if isinstance(field.annotation, type) and issubclass(field.annotation, Helper): self._helpers.append(key) return self @@ -328,8 +339,8 @@ def get_assets( return { key: asset for key, asset in self.item.get_assets().items() - if (service_type is ServiceType and service_type.value in asset.extra_fields) - or any(ServiceType.from_value(field, default=None) is ServiceType for field in asset.extra_fields) + if (isinstance(service_type, ServiceType) and service_type.value in asset.extra_fields) + or any(ServiceType.from_value(field, default=False) for field in asset.extra_fields) } def __repr__(self) -> str: diff --git a/STACpopulator/extensions/cf.py b/STACpopulator/extensions/cf.py new file mode 100644 index 0000000..ff15cf2 --- /dev/null +++ b/STACpopulator/extensions/cf.py @@ -0,0 +1,239 @@ +"""CF Extension Module.""" + +from __future__ import annotations + +import functools +from typing import ( + Any, + Dict, + Generic, + Iterable, + List, + Literal, + Optional, + TypeVar, + Union, + cast, + get_args, +) + +import pystac +from pydantic import BaseModel +from pystac.extensions import item_assets +from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension + +from STACpopulator.extensions.base import ExtensionHelper +from STACpopulator.stac_utils import ServiceType + +T = TypeVar("T", pystac.Collection, pystac.Item, pystac.Asset) +SchemaName = Literal["cf"] +SCHEMA_URI = "https://stac-extensions.github.io/cf/v0.2.0/schema.json" +PREFIX = f"{get_args(SchemaName)[0]}:" +PARAMETER_PROP = PREFIX + "parameter" + + +class CFParameter(BaseModel): + """CFParameter.""" + + name: str + unit: str + + def __repr__(self) -> str: + """Return string repr.""" + return f"" + + +class CFHelper(ExtensionHelper): + """CFHelper.""" + + _prefix: str = "cf" + variables: Dict[str, Any] + + @functools.cached_property + def parameters(self) -> List[CFParameter]: + """Extracts cf:parameter-like information from item_data.""" + parameters = [] + + for var in self.variables.values(): + attrs = var.get("attributes", {}) + name = attrs.get("standard_name") # Get the required standard name + if not name: + continue # Skip if no valid name + unit = attrs.get("units", "") + parameters.append(CFParameter(name=name, unit=unit)) + + return parameters + + @classmethod + def from_data( + cls, + data: dict[str, Any], + **kwargs, + ) -> "CFHelper": + """Create a CFHelper instance from raw data.""" + return cls(variables=data["data"]["variables"], **kwargs) + + def apply(self, item: T, add_if_missing: bool = True) -> T: + """Apply the Datacube extension to an item.""" + ext = CFExtension.ext(item, add_if_missing=add_if_missing) + ext.apply(parameters=self.parameters) + + # FIXME: This temporary workaround has been added to comply with the (most certainly buggy) validation schema for CF extension + # It should be remove once the PR is integrated since applying on the item should be enough + asset = item.assets["HTTPServer"] + cf_asset_ext = CFExtension.ext(asset, add_if_missing=True) + cf_asset_ext.apply(parameters=self.parameters) + return item + + +class CFExtension( + Generic[T], + PropertiesExtension, + ExtensionManagementMixin[Union[pystac.Asset, pystac.Item, pystac.Collection]], +): + """CF Metadata Extension.""" + + @property + def name(self) -> SchemaName: + """Return the schema name.""" + return get_args(SchemaName)[0] + + @property + def parameter(self) -> List[dict[str, Any]] | None: + """Get or set the CF parameter(s).""" + return self._get_property(PARAMETER_PROP, int) + + @parameter.setter + def parameter(self, v: List[dict[str, Any]] | None) -> None: + self._set_property(PARAMETER_PROP, v) + + def apply( + self, + parameters: Union[List[CFParameter], List[dict[str, Any]]], + ) -> None: + """Apply CF Extension properties to the extended :class:`~pystac.Item` or :class:`~pystac.Asset`.""" + if not isinstance(parameters[0], dict): + parameters = [p.model_dump() for p in parameters] + self.parameter = parameters + + @classmethod + def get_schema_uri(cls) -> str: + """Return this extension's schema URI.""" + return SCHEMA_URI + + @classmethod + def ext(cls, obj: T, add_if_missing: bool = False) -> CFExtension[T]: + """Extend the given STAC Object with properties from the :stac-ext:`CF Extension `. + + This extension can be applied to instances of :class:`~pystac.Item`, :class:`~pystac.Asset`, or :class:`~pystac.Collection`. + + Raises + ------ + pystac.ExtensionTypeError : If an invalid object type is passed. + """ + if isinstance(obj, pystac.Collection): + cls.ensure_has_extension(obj, add_if_missing) + return cast(CFExtension[T], CollectionCFExtension(obj)) + elif isinstance(obj, pystac.Item): + cls.ensure_has_extension(obj, add_if_missing) + return cast(CFExtension[T], ItemCFExtension(obj)) + elif isinstance(obj, pystac.Asset): + cls.ensure_owner_has_extension(obj, add_if_missing) + return cast(CFExtension[T], AssetCFExtension(obj)) + elif isinstance(obj, item_assets.AssetDefinition): + cls.ensure_owner_has_extension(obj, add_if_missing) + return cast(CFExtension[T], ItemAssetsCFExtension(obj)) + else: + raise pystac.ExtensionTypeError(cls._ext_error_message(obj)) + + +class ItemCFExtension(CFExtension[pystac.Item]): + """ + A concrete implementation of :class:`CFExtension` on an :class:`~pystac.Item`. + + Extends the properties of the Item to include properties defined in the + :stac-ext:`CF Extension `. + + This class should generally not be instantiated directly. Instead, call + :meth:`CFExtension.ext` on an :class:`~pystac.Item` to extend it. + """ + + def __init__(self, item: pystac.Item) -> None: + self.item = item + self.properties = item.properties + + def get_assets( + self, + service_type: Optional[ServiceType] = None, + ) -> dict[str, pystac.Asset]: + """Get the item's assets where eo:bands are defined. + + Args: + service_type: If set, filter the assets such that only those with a + matching :class:`~STACpopulator.stac_utils.ServiceType` are returned. + + Returns + ------- + Dict[str, Asset]: A dictionary of assets that match ``service_type`` + if set or else all of this item's assets were service types are defined. + """ + return { + key: asset + for key, asset in self.item.get_assets().items() + if (isinstance(service_type, ServiceType) and service_type.value in asset.extra_fields) + or any(ServiceType.from_value(field, default=False) for field in asset.extra_fields) + } + + def __repr__(self) -> str: + """Return repr.""" + return f"" + + +class ItemAssetsCFExtension(CFExtension[item_assets.AssetDefinition]): + """Extention for CF item assets.""" + + properties: dict[str, Any] + asset_defn: item_assets.AssetDefinition + + def __init__(self, item_asset: item_assets.AssetDefinition) -> None: + self.asset_defn = item_asset + self.properties = item_asset.properties + + +class AssetCFExtension(CFExtension[pystac.Asset]): + """ + A concrete implementation of :class:`CFExtension` on an :class:`~pystac.Asset`. + + Extends the Asset fields to include properties defined in the + :stac-ext:`CF Extension `. + + This class should generally not be instantiated directly. Instead, call + :meth:`CFExtension.ext` on an :class:`~pystac.Asset` to extend it. + """ + + asset_href: str + """The ``href`` value of the :class:`~pystac.Asset` being extended.""" + + properties: dict[str, Any] + """The :class:`~pystac.Asset` fields, including extension properties.""" + + additional_read_properties: Optional[Iterable[dict[str, Any]]] = None + """If present, this will be a list containing 1 dictionary representing the + properties of the owning :class:`~pystac.Item`.""" + + def __init__(self, asset: pystac.Asset) -> None: + self.asset_href = asset.href + self.properties = asset.extra_fields + if asset.owner and isinstance(asset.owner, pystac.Item): + self.additional_read_properties = [asset.owner.properties] + + def __repr__(self) -> str: + """Return repr.""" + return f"" + + +class CollectionCFExtension(CFExtension[pystac.Collection]): + """Extension for CF data.""" + + def __init__(self, collection: pystac.Collection) -> None: + self.collection = collection diff --git a/STACpopulator/extensions/cmip6.py b/STACpopulator/extensions/cmip6.py index b7596c8..e4fa172 100644 --- a/STACpopulator/extensions/cmip6.py +++ b/STACpopulator/extensions/cmip6.py @@ -308,8 +308,8 @@ def get_assets( return { key: asset for key, asset in self.item.get_assets().items() - if (service_type is ServiceType and service_type.value in asset.extra_fields) - or any(ServiceType.from_value(field, default=None) is ServiceType for field in asset.extra_fields) + if (isinstance(service_type, ServiceType) and service_type.value in asset.extra_fields) + or any(ServiceType.from_value(field, default=False) for field in asset.extra_fields) } def __repr__(self) -> str: diff --git a/STACpopulator/extensions/datacube.py b/STACpopulator/extensions/datacube.py index 6167acc..19cad36 100644 --- a/STACpopulator/extensions/datacube.py +++ b/STACpopulator/extensions/datacube.py @@ -141,6 +141,15 @@ def __init__(self, attrs: MutableMapping[str, Any]) -> None: }, } + @classmethod + def from_data( + cls, + data: dict[str, Any], + **kwargs, + ) -> "DataCubeHelper": + """Create a DataCubeHelper instance from raw data.""" + return cls(attrs=data["data"]) + @property @functools.cache def dimensions(self) -> dict[str, Dimension]: @@ -213,9 +222,11 @@ def variables(self) -> dict[str, Variable]: else: dtype = VariableType.DATA.value + dimensions = meta.get("shape", []) + variables[name] = Variable( properties=dict( - dimensions=meta["shape"], + dimensions=[] if dimensions == [""] else dimensions, type=dtype, description=attrs.get("description", attrs.get("long_name", "")), unit=attrs.get("units", ""), diff --git a/STACpopulator/extensions/file.py b/STACpopulator/extensions/file.py new file mode 100644 index 0000000..a026b01 --- /dev/null +++ b/STACpopulator/extensions/file.py @@ -0,0 +1,103 @@ +"""FileHelper module.""" + +import functools +import logging +from typing import Dict, Optional, TypeVar + +import pystac +import requests +from pydantic import ConfigDict, Field +from pystac.extensions.file import FileExtension +from requests import Session + +from STACpopulator.extensions.base import ExtensionHelper + +# Constants +T = TypeVar("T", pystac.Asset, pystac.Link) +logger = logging.getLogger(__name__) + + +class FileHelper(ExtensionHelper): + """Helper to handle file info from elements of types Asset and Link.""" + + access_urls: Dict[str, str] + asset_key: str = "HTTPServer" + session: Optional[Session] = Field(default=None, exclude=True) + + model_config = ConfigDict(arbitrary_types_allowed=True) + + def __init__( + self, + access_urls: dict[str, str], + asset_key: str = "HTTPServer", + session: Optional[Session] = None, + ) -> None: + """Initialize the file helper. + + Parameters + ---------- + access_urls : dict[str, str] + Dictionary of catalog access URLs. + asset_key : str. + Asset key matching main file in access_urls. Defaults to `HTTPServer`. + session : requests.Session, optional + Requests session object to use for HTTP requests. Defaults to `requests.Session()`. + """ + super().__init__( + access_urls=access_urls, + asset_key=asset_key, + session=session or requests.Session(), + ) + + @classmethod + def from_data( + cls, + data: dict[str, any], + **kwargs, + ) -> "FileHelper": + """Create a FileHelper instance from raw data.""" + return cls(access_urls=data["data"]["access_urls"], **kwargs) + + def apply(self, item: pystac.Item, add_if_missing: bool = True) -> T: + """Apply the FileExtension to an asset.""" + # FIXME: This extension is applicable to Assets and Links. + # Currently applied to the HTTPServer asset by default to avoid heavy load during populator run + asset = item.assets[self.asset_key] + file_ext = FileExtension.ext(asset, add_if_missing=add_if_missing) + file_ext.apply( + size=self.size, + byte_order=None, # NOTE: Appears to be variable-related. Unclear what would be the value for the whole file. + header_size=None, # NOTE: No utility yet available. Might not be relevant. + checksum=None, # NOTE: Should be made available in the metadata on THREDDS catalog + values=None, # NOTE: Deprecated field + local_path=None, # NOTE: Seems to be irrelevant + ) + + @functools.cached_property + def size(self) -> Optional[int]: + """Get file size in bytes. + + Returns + ------- + (int): File size in bytes or None if: + - the `asset_key` is not in `access_urls` dict + - the server does include a Content-Length header + - the Content-Length value is non-numeric. + """ + if self.asset_key not in self.access_urls: + logger.warning("Asset key %s is not present in access URLs.", self.asset_key) + return None + res = self.session.head(self.access_urls[self.asset_key]) + res.raise_for_status() + + content_length = res.headers.get("Content-Length") + try: + return int(content_length) + except (TypeError, ValueError) as e: + logger.exception( + "Failed to parse Content-Length header (%r) for asset key %s", + content_length, + self.asset_key, + exc_info=e, + ) + return None diff --git a/STACpopulator/extensions/hrdps.py b/STACpopulator/extensions/hrdps.py new file mode 100644 index 0000000..3d3ba2c --- /dev/null +++ b/STACpopulator/extensions/hrdps.py @@ -0,0 +1,9 @@ +from STACpopulator.extensions.rdps import RDPSDataModel + + +# Customize the THREDDSCatalogDataModel +class HRDPSDataModel(RDPSDataModel): + """Data model for HRDPS NetCDF datasets.""" + + # FIXME: No specific props beyond RPDS. Kept to facilitate evolution. + pass diff --git a/STACpopulator/extensions/rdps.py b/STACpopulator/extensions/rdps.py new file mode 100644 index 0000000..64cc09b --- /dev/null +++ b/STACpopulator/extensions/rdps.py @@ -0,0 +1,11 @@ +from STACpopulator.extensions.cf import CFHelper +from STACpopulator.extensions.file import FileHelper +from STACpopulator.extensions.thredds import THREDDSCatalogDataModel + + +class RDPSDataModel(THREDDSCatalogDataModel): + """Data model for RDPS NetCDF datasets.""" + + # Extension classes + cf: CFHelper + file: FileHelper diff --git a/STACpopulator/extensions/thredds.py b/STACpopulator/extensions/thredds.py index c352c2b..ae49dca 100644 --- a/STACpopulator/extensions/thredds.py +++ b/STACpopulator/extensions/thredds.py @@ -1,3 +1,6 @@ +from __future__ import annotations + +import inspect from typing import Generic, TypeVar, Union, cast import pystac @@ -68,7 +71,7 @@ def get_schema_uri(cls) -> str: return "" @classmethod - def ext(cls, obj: T, add_if_missing: bool = False) -> "THREDDSExtension[T]": + def ext(cls, obj: T, add_if_missing: bool = False) -> THREDDSExtension[T]: """Extend the given STAC Object with properties from the :stac-ext:`THREDDS Extension `. This extension can be applied to instances of :class:`~pystac.Item` or @@ -135,6 +138,15 @@ class THREDDSHelper(Helper): def __init__(self, access_urls: dict[str, str]) -> None: self.access_urls = {ServiceType.from_value(svc): url for svc, url in access_urls.items()} + @classmethod + def from_data( + cls, + data: dict[str, any], + **kwargs, + ) -> THREDDSHelper: + """Create a THREDDSHelper instance from raw data.""" + return cls(access_urls=data["data"]["access_urls"]) + @property def services(self) -> list[THREDDSService]: """Return a list of THREDDS services including one for this helper.""" @@ -180,13 +192,16 @@ class THREDDSCatalogDataModel(BaseSTAC): model_config = ConfigDict(populate_by_name=True, extra="ignore", arbitrary_types_allowed=True) @classmethod - def from_data(cls, data: dict) -> None: + def from_data(cls, data: dict, **kwargs) -> THREDDSCatalogDataModel: """ Instantiate class from data provided by THREDDS Loader. This is where we match the Loader's output to the STAC item and extensions inputs. If we had multiple loaders, that's probably the only thing that would be different between them. """ + # Inject kwargs for helpers into data + data["_extra_kwargs"] = kwargs + return cls( data=data, start_datetime=data["groups"]["CFMetadata"]["attributes"]["time_coverage_start"], @@ -197,16 +212,38 @@ def from_data(cls, data: dict) -> None: @model_validator(mode="before") @classmethod - def datacube_helper(cls, data: dict) -> dict: - """Instantiate the DataCubeHelper.""" - data["datacube"] = DataCubeHelper(data["data"]) - return data - - @model_validator(mode="before") - @classmethod - def thredds_helper(cls, data: dict) -> dict: - """Instantiate the THREDDSHelper.""" - data["thredds"] = THREDDSHelper(data["data"]["access_urls"]) + def instantiate_helpers(cls, data: dict[str, any]) -> dict[str, any]: + """Automatically instantiate helper fields before model initialization. + + This method detects all fields annotated as subclasses of `Helper` + and populates them by calling their respective `from_data()` constructors. + Any extra keyword arguments are forwarded to helpers that accept them. + + Parameters + ---------- + data : dict[str, Any] + The raw input dictionary of parameters used to construct this class. + + Returns + ------- + dict[str, Any] + The modified data dictionary with instantiated helper objects injected + into their corresponding fields. + """ + # Retrieve forwarded kwargs and remove from the data object + kwargs = data["data"].pop("_extra_kwargs", {}) + + # Iterate over model fields and find helpers + for field_name, field in cls.model_fields.items(): + field_type = field.annotation + if isinstance(field_type, type) and issubclass(field_type, Helper): + # if helper not provided in constructor + if field_name not in data: + # Filter kwargs to only include those accepted by the helper's constructor. + type_signature = inspect.signature(field_type.__init__) + accepted_kwargs = {k: v for k, v in kwargs.items() if k in type_signature.parameters} + # Instantiate helper and forward accepted kwargs + data[field_name] = field_type.from_data(data, **accepted_kwargs) return data def create_uid(self) -> str: diff --git a/STACpopulator/implementations/HRDPS_CRIM/__init__.py b/STACpopulator/implementations/HRDPS_CRIM/__init__.py new file mode 100644 index 0000000..37a1759 --- /dev/null +++ b/STACpopulator/implementations/HRDPS_CRIM/__init__.py @@ -0,0 +1,3 @@ +from .add_HRDPS import add_parser_args, runner + +__all__ = ["add_parser_args", "runner"] diff --git a/STACpopulator/implementations/HRDPS_CRIM/add_HRDPS.py b/STACpopulator/implementations/HRDPS_CRIM/add_HRDPS.py new file mode 100644 index 0000000..1fc02a2 --- /dev/null +++ b/STACpopulator/implementations/HRDPS_CRIM/add_HRDPS.py @@ -0,0 +1,66 @@ +import argparse +import logging +from typing import Any + +from requests.sessions import Session + +from STACpopulator.extensions.hrdps import HRDPSDataModel +from STACpopulator.input import ErrorLoader, THREDDSLoader +from STACpopulator.populator_base import STACpopulatorBase + +LOGGER = logging.getLogger(__name__) + + +class HRDPSpopulator(STACpopulatorBase): + """Populator that creates STAC objects representing HRDPS data from a THREDDS catalog.""" + + data_model = HRDPSDataModel + item_geometry_model = None # Unnecessary, but kept for consistency + + def create_stac_item(self, item_name: str, item_data: dict[str, Any]) -> dict[str, Any]: + """Return a STAC item.""" + dm = self.data_model.from_data(item_data) + return dm.stac_item() + + +def add_parser_args(parser: argparse.ArgumentParser) -> None: + """Add additional CLI arguments to the argument parser.""" + parser.description = "HRDPS STAC populator from a THREDDS catalog or NCML XML." + parser.add_argument("stac_host", help="STAC API URL") + parser.add_argument("href", help="URL to a THREDDS catalog or a NCML XML with HRDPS metadata.") + parser.add_argument("--update", action="store_true", help="Update collection and its items") + parser.add_argument( + "--mode", + choices=["full", "single"], + default="full", + help="Operation mode, processing the full dataset or only the single reference.", + ) + parser.add_argument( + "--config", + type=str, + help=( + "Override configuration file for the populator. " + "By default, uses the adjacent configuration to the implementation class." + ), + ) + + +def runner(ns: argparse.Namespace, session: Session) -> int: + """Run the populator.""" + LOGGER.info(f"Arguments to call: {vars(ns)}") + + if ns.mode == "full": + data_loader = THREDDSLoader(ns.href, session=session) + else: + # To be implemented + data_loader = ErrorLoader() + + c = HRDPSpopulator( + ns.stac_host, + data_loader, + update=ns.update, + session=session, + config_file=ns.config, + ) + c.ingest() + return 0 diff --git a/STACpopulator/implementations/HRDPS_CRIM/collection_config.yml b/STACpopulator/implementations/HRDPS_CRIM/collection_config.yml new file mode 100644 index 0000000..62216d6 --- /dev/null +++ b/STACpopulator/implementations/HRDPS_CRIM/collection_config.yml @@ -0,0 +1,148 @@ +title: HRDPS +id: HRDPS_CRIM +description: High Resolution Deterministic Prediction System +keywords: + [ + "HRDPS", + "ECCC", + "MSC", + "Climate Change", + "Prediction", + "High resolution", + "Deterministic", + "Meteorological data", + "Weather and Climate", + "Meteorological Service of Canada", + "Environment and Climate Change Canada", + "Weather and Environmental Operations", + ] +license: other +spatialextent: [-152.7685268, 27.2840148, -40.6938032, 70.6164825] +temporalextent: ["2014-11-18", "2025-04-01"] + +providers: + - name: ECCC + description: Environment and Climate Change Canada (ECCC) + roles: + - licensor + - processor + - producer + url: https://github.com/julemai/CaSPAr/wiki/Available-products + - name: MSC + description: Meteorological Service of Canada (MSC) + roles: + - processor + url: https://eccc-msc.github.io/open-data + - name: Ouranos + description: Ouranos + roles: + - processor + - host + url: https://www.ouranos.ca + +links: + - rel: about + title : Project homepage + target : https://open.canada.ca/data/en/dataset/5b401fa0-6c29-57f0-b3d5-749f301d829d + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: about + title : Page d'accueil du projet + target : https://open.canada.ca/data/en/dataset/5b401fa0-6c29-57f0-b3d5-749f301d829d + media_type: text/html + extra_fields: + hreflang: fr-CA + - rel: author + title: Environment and Climate Change Canada (ECCC) + target: https://www.canada.ca/en/environment-climate-change.html + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: author + title: Environnement et Changement Climatique Canada (ECCC) + target: https://www.canada.ca/fr/environnement-changement-climatique.html + media_type: text/html + extra_fields: + hreflang: fr-CA + - rel: describedby + title: Data and Products of the High Resolution Deterministic Prediction System (HRDPS) + target: https://eccc-msc.github.io/open-data/msc-data/nwp_hrdps/readme_hrdps_en + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: describedby + title: Données et Produits du Système à haute résolution de prévision déterministe (SHRPD) + target: https://eccc-msc.github.io/open-data/msc-data/nwp_hrdps/readme_hrdps_fr + media_type: text/html + extra_fields: + hreflang: fr-CA + - rel: license + title: Environment and Climate Change Canada Data Servers End-use License + target: https://eccc-msc.github.io/open-data/licence/readme_en/ + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: license + title: Licence d’utilisation finale pour les serveurs de données d’Environnement et Changement climatique Canada + target: https://eccc-msc.github.io/open-data/licence/readme_fr/ + media_type: text/html + extra_fields: + hreflang: fr-CA + +contacts: + - organization: Computer Research Institute of Montreal (CRIM) + identifier: CRIM + position: Support + roles: + - indexer + emails: + - value: support-geo@crim.ca + roles: + - work + logo: + href: https://raw.githubusercontent.com/henriaidasso/image-store/refs/heads/main/img/crim.png + rel: icon + type: image/png + title: CRIM + links: + - rel: about + title: Site web + target: https://www.crim.ca/fr/ + media_type: text/html + extra_fields: + hreflang: fr-CA + - organization: Environment and Climate Change Canada (ECCC) + identifier: ECCC + position: Support + roles: + - licensor + - processor + - producer + emails: + - value: ECWeather-Meteo@ec.gc.ca + roles: + - work + phones: + - value: +18199972800 + roles: + - work + addresses: + - deliveryPoint: 77 Westmorland Street, suite 260 + city: Fredericton + administrativeArea: New Brunswick + postalCode: E3B 6Z4 + country: Canada + links: + - rel: about + title: Web page + target: https://www.canada.ca/en/environment-climate-change.html + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: about + title: Site web + target: https://www.canada.ca/fr/environnement-changement-climatique.html + media_type: text/html + extra_fields: + hreflang: fr-CA diff --git a/STACpopulator/implementations/RDPS_CRIM/__init__.py b/STACpopulator/implementations/RDPS_CRIM/__init__.py new file mode 100644 index 0000000..7e1b1e1 --- /dev/null +++ b/STACpopulator/implementations/RDPS_CRIM/__init__.py @@ -0,0 +1,3 @@ +from .add_RDPS import add_parser_args, runner + +__all__ = ["add_parser_args", "runner"] diff --git a/STACpopulator/implementations/RDPS_CRIM/add_RDPS.py b/STACpopulator/implementations/RDPS_CRIM/add_RDPS.py new file mode 100644 index 0000000..e176659 --- /dev/null +++ b/STACpopulator/implementations/RDPS_CRIM/add_RDPS.py @@ -0,0 +1,66 @@ +import argparse +import logging +from typing import Any + +from requests.sessions import Session + +from STACpopulator.extensions.rdps import RDPSDataModel +from STACpopulator.input import ErrorLoader, THREDDSLoader +from STACpopulator.populator_base import STACpopulatorBase + +LOGGER = logging.getLogger(__name__) + + +class RDPSpopulator(STACpopulatorBase): + """Populator that creates STAC objects representing RDPS data from a THREDDS catalog.""" + + data_model = RDPSDataModel + item_geometry_model = None # Unnecessary, but kept for consistency + + def create_stac_item(self, item_name: str, item_data: dict[str, Any]) -> dict[str, Any]: + """Return a STAC item.""" + dm = self.data_model.from_data(item_data, session=self._session) + return dm.stac_item() + + +def add_parser_args(parser: argparse.ArgumentParser) -> None: + """Add additional CLI arguments to the argument parser.""" + parser.description = "RDPS STAC populator from a THREDDS catalog or NCML XML." + parser.add_argument("stac_host", help="STAC API URL") + parser.add_argument("href", help="URL to a THREDDS catalog or a NCML XML with RDPS metadata.") + parser.add_argument("--update", action="store_true", help="Update collection and its items") + parser.add_argument( + "--mode", + choices=["full", "single"], + default="full", + help="Operation mode, processing the full dataset or only the single reference.", + ) + parser.add_argument( + "--config", + type=str, + help=( + "Override configuration file for the populator. " + "By default, uses the adjacent configuration to the implementation class." + ), + ) + + +def runner(ns: argparse.Namespace, session: Session) -> int: + """Run the populator.""" + LOGGER.info(f"Arguments to call: {vars(ns)}") + + if ns.mode == "full": + data_loader = THREDDSLoader(ns.href, session=session) + else: + # To be implemented + data_loader = ErrorLoader() + + c = RDPSpopulator( + ns.stac_host, + data_loader, + update=ns.update, + session=session, + config_file=ns.config, + ) + c.ingest() + return 0 diff --git a/STACpopulator/implementations/RDPS_CRIM/collection_config.yml b/STACpopulator/implementations/RDPS_CRIM/collection_config.yml new file mode 100644 index 0000000..85180be --- /dev/null +++ b/STACpopulator/implementations/RDPS_CRIM/collection_config.yml @@ -0,0 +1,149 @@ +id: RDPS_CRIM +title: RDPS +description: Regional Deterministic Prediction System (RDPS) +keywords: + [ + "RDPS", + "ECCC", + "MSC", + "Climate Change", + "Prediction", + "Regional", + "Deterministic", + "Weather forecasts", + "Meteorological data", + "Weather and Climate", + "Meteorological Service of Canada", + "Environment and Climate Change Canada", + "Weather and Environmental Operations", + ] +license: other +spatialextent: [-166.3, 7.2, -14.4, 85.0] +temporalextent: ["1986-04-22", "2025-04-01"] + +providers: + - name: ECCC + description: Environment and Climate Change Canada (ECCC) + roles: + - licensor + - processor + - producer + url: https://github.com/julemai/CaSPAr/wiki/Available-products + - name: MSC + description: Meteorological Service of Canada (MSC) + roles: + - processor + url: https://eccc-msc.github.io/open-data + - name: Ouranos + description: Ouranos + roles: + - processor + - host + url: https://www.ouranos.ca + +links: + - rel: about + title: Project homepage + target: https://open.canada.ca/data/en/dataset/a9f2828c-0d78-5eb6-a4c7-1fc1219f1e3d + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: about + title: Page d'accueil du projet + target: https://open.canada.ca/data/fr/dataset/a9f2828c-0d78-5eb6-a4c7-1fc1219f1e3d + media_type: text/html + extra_fields: + hreflang: fr-CA + - rel: author + title: Environment and Climate Change Canada (ECCC) + target: https://www.canada.ca/en/environment-climate-change.html + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: author + title: Environnement et Changement Climatique Canada (ECCC) + target: https://www.canada.ca/fr/environnement-changement-climatique.html + media_type: text/html + extra_fields: + hreflang: fr-CA + - rel: describedby + title: Data and products of the Regional Deterministic Prediction System (RDPS) + target: https://eccc-msc.github.io/open-data/msc-data/nwp_rdps/readme_rdps_en + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: describedby + title: Données et produits du Système régional de prévision déterministe (SRPD) + target: https://eccc-msc.github.io/open-data/msc-data/nwp_rdps/readme_rdps_fr/ + media_type: text/html + extra_fields: + hreflang: fr-CA + - rel: license + title: Environment and Climate Change Canada Data Servers End-use License + target: https://eccc-msc.github.io/open-data/licence/readme_en/ + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: license + title: Licence d’utilisation finale pour les serveurs de données d’Environnement et Changement climatique Canada + target: https://eccc-msc.github.io/open-data/licence/readme_fr/ + media_type: text/html + extra_fields: + hreflang: fr-CA + +contacts: + - organization: Computer Research Institute of Montreal (CRIM) + identifier: CRIM + position: Support + roles: + - indexer + emails: + - value: support-geo@crim.ca + roles: + - work + logo: + href: https://raw.githubusercontent.com/henriaidasso/image-store/refs/heads/main/img/crim.png + rel: icon + type: image/png + title: CRIM + links: + - rel: about + title: Site web + target: https://www.crim.ca/fr/ + media_type: text/html + extra_fields: + hreflang: fr-CA + - organization: Environment and Climate Change Canada (ECCC) + identifier: ECCC + position: Support + roles: + - licensor + - processor + - producer + emails: + - value: ECWeather-Meteo@ec.gc.ca + roles: + - work + phones: + - value: +18199972800 + roles: + - work + addresses: + - deliveryPoint: 77 Westmorland Street, suite 260 + city: Fredericton + administrativeArea: New Brunswick + postalCode: E3B 6Z4 + country: Canada + links: + - rel: about + title: Web page + target: https://www.canada.ca/en/environment-climate-change.html + media_type: text/html + extra_fields: + hreflang: en-CA + - rel: about + title: Site web + target: https://www.canada.ca/fr/environnement-changement-climatique.html + media_type: text/html + extra_fields: + hreflang: fr-CA diff --git a/STACpopulator/implementations/__init__.py b/STACpopulator/implementations/__init__.py index fe16c15..bf97fb7 100644 --- a/STACpopulator/implementations/__init__.py +++ b/STACpopulator/implementations/__init__.py @@ -5,4 +5,4 @@ # - adds additional arguments to the given parser needed to run this implementation # - def runner(ns: argparse.Namespace) -> int: # - runs the implementation given a namespace constructed from the parser arguments supplied -__all__ = ["CMIP6_UofT", "DirectoryLoader", "CORDEXCMIP6_Ouranos"] +__all__ = ["CMIP6_UofT", "DirectoryLoader", "CORDEXCMIP6_Ouranos", "RDPS_CRIM", "HRDPS_CRIM"] diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 5b81d89..81957c5 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -144,6 +144,14 @@ def create_stac_collection(self) -> dict[str, Any]: # Add any assets if provided in the config self._collection_info["assets"] = self.__make_collection_assets() + # Cast providers to pystac objects + self._collection_info["providers"] = self.__make_collection_providers() + + # Add contacts as extra_field + if "contacts" in self._collection_info: + self._collection_info["extra_fields"] = {} + self._collection_info["extra_fields"]["contacts"] = self.__make_collection_contacts() + # Construct links if provided in the config. This needs to be done before constructing a collection object. collection_links = self.__make_collection_links() @@ -180,6 +188,29 @@ def __make_collection_assets(self) -> Dict[str, pystac.Asset]: pystac_assets[asset_name] = pystac.Asset(**asset_info) return pystac_assets + def __make_collection_providers(self) -> List[pystac.Provider]: + """Create collection level providers based on data read in from the configuration file. + + :return: List of pystac Provider objects + :rtype: List[pystac.Provider] + """ + pystac_providers = [] + if "providers" in self._collection_info: + providers = self._collection_info.pop("providers") + pystac_providers = [pystac.Provider(**provider) for provider in providers] + return pystac_providers + + def __make_collection_contacts(self) -> List[dict]: + """Create collection level contacts based on data read in from the configuration file. + + :return: List of dictionnary contact objects + :rtype: List[dict] + """ + contacts = [] + if "contacts" in self._collection_info: + contacts = self._collection_info.pop("contacts") + return contacts + def publish_stac_collection(self, collection_data: dict[str, Any]) -> None: """Publish this collection by uploading it to the STAC catalog at self.stac_host.""" post_stac_collection(self.stac_host, collection_data, self.update, session=self._session) diff --git a/tests/test_rdps.py b/tests/test_rdps.py index eeca56f..7d99a90 100644 --- a/tests/test_rdps.py +++ b/tests/test_rdps.py @@ -2,27 +2,62 @@ import json -from STACpopulator.extensions.thredds import THREDDSCatalogDataModel +from STACpopulator.extensions.hrdps import HRDPSDataModel +from STACpopulator.extensions.rdps import RDPSDataModel def test_rdps(): attrs = json.load(open("tests/data/rdps.json")) - model = THREDDSCatalogDataModel.from_data(attrs) + model = RDPSDataModel.from_data(attrs) item = model.stac_item() - assert set(model._helpers) == {"thredds", "datacube"} + assets = item["assets"] + cf_parameters = item["properties"].get("cf:parameter", []) + parameter_names = [param["name"] for param in cf_parameters] + + assert set(model._helpers) == {"thredds", "datacube", "cf", "file"} assert item["id"] == "birdhouse__testdata__HRDPS__RDPS_sample__2024010100_000.nc" + # DataCubeExtension + assert "cube:variables" in item["properties"] assert "TD" in item["properties"]["cube:variables"] + # CFExtension + assert any("cf:parameter" in asset for asset in assets.values()) + assert "cf:parameter" in item["properties"] + assert set(["time", "latitude", "longitude"]).issubset(set(parameter_names)) + # FileExtension + assert any("file:size" in asset for asset in assets.values()) -def test_hrdps(): +def test_hrdps_sfc(): attrs = json.load(open("tests/data/hrdps_sfc.json")) - model = THREDDSCatalogDataModel.from_data(attrs) + model = HRDPSDataModel.from_data(attrs) item = model.stac_item() + assets = item["assets"] + cf_parameters = item["properties"].get("cf:parameter", []) + parameter_names = [param["name"] for param in cf_parameters] + # DataCubeExtension assert "HRDPS_P_PR_SFC" in item["properties"]["cube:variables"] assert item["properties"]["cube:dimensions"].keys() == {"time", "rlat", "rlon"} + # CFExtension + assert any("cf:parameter" in asset for asset in assets.values()) + assert "cf:parameter" in item["properties"] + assert set(["time", "latitude", "longitude"]).issubset(set(parameter_names)) + # FileExtension + assert any("file:size" in asset for asset in assets.values()) + +def test_hrdps_p_tt(): attrs = json.load(open("tests/data/hrdps_p_tt.json")) - model = THREDDSCatalogDataModel.from_data(attrs) + model = HRDPSDataModel.from_data(attrs) item = model.stac_item() + assets = item["assets"] + cf_parameters = item["properties"].get("cf:parameter", []) + parameter_names = [param["name"] for param in cf_parameters] + # DataCubeExtension assert "HRDPS_P_TT_10000" in item["properties"]["cube:variables"] assert item["properties"]["cube:dimensions"].keys() == {"time", "rlat", "rlon"} + # CFExtension + assert any("cf:parameter" in asset for asset in assets.values()) + assert "cf:parameter" in item["properties"] + assert set(["time", "latitude", "longitude"]).issubset(set(parameter_names)) + # FileExtension + assert any("file:size" in asset for asset in assets.values())