From 1745f8f3ae06b188fdd696c3b6d0ae7f91ffe3d4 Mon Sep 17 00:00:00 2001 From: Rob Emanuele Date: Thu, 11 Nov 2021 22:03:22 -0500 Subject: [PATCH 1/6] Use clone_item to determine preserve_dict Currently the clone_item parameter only affects the case where the items are pystac.Items. The analogous use for a list of Dicts representing the Items is by using preserve_dict if the Items should be "cloned". This allows users to use clone_item=False to speed up reading ItemCollections in the case that they are passing in a list of Dicts. --- pystac/item_collection.py | 2 +- tests/test_item_collection.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pystac/item_collection.py b/pystac/item_collection.py index 206fee01a..8f1a6bf7f 100644 --- a/pystac/item_collection.py +++ b/pystac/item_collection.py @@ -88,7 +88,7 @@ def map_item(item_or_dict: ItemLike) -> pystac.Item: if isinstance(item_or_dict, pystac.Item): return item_or_dict.clone() if clone_items else item_or_dict else: - return pystac.Item.from_dict(item_or_dict) + return pystac.Item.from_dict(item_or_dict, preserve_dict=clone_items) self.items = list(map(map_item, items)) self.extra_fields = extra_fields or {} diff --git a/tests/test_item_collection.py b/tests/test_item_collection.py index 11603bd8d..2f3b976da 100644 --- a/tests/test_item_collection.py +++ b/tests/test_item_collection.py @@ -112,7 +112,7 @@ def test_from_relative_path(self) -> None: def test_from_list_of_dicts(self) -> None: item_dict = self.stac_io.read_json(self.SIMPLE_ITEM) - item_collection = pystac.ItemCollection(items=[item_dict]) + item_collection = pystac.ItemCollection(items=[item_dict], clone_items=True) self.assertEqual(item_collection[0].id, item_dict.get("id")) From 9278d501d133c1c15b40fb7b5ca056b548a96c32 Mon Sep 17 00:00:00 2001 From: Rob Emanuele Date: Thu, 11 Nov 2021 22:06:36 -0500 Subject: [PATCH 2/6] Take a Callable instead of a Type for set_default This shouldn't break any client code as a Type[StacIO] is itself a Callable[[], StacIO]. This allows flexibility in how we generate the StacIO object, which is useful in cases like mocking the StacIO object. --- pystac/stac_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pystac/stac_io.py b/pystac/stac_io.py index a0108566a..f5ddbc252 100644 --- a/pystac/stac_io.py +++ b/pystac/stac_io.py @@ -3,12 +3,12 @@ import json from typing import ( Any, + Callable, Dict, List, Optional, TYPE_CHECKING, Tuple, - Type, Union, ) @@ -37,7 +37,7 @@ class StacIO(ABC): - _default_io: Optional[Type["StacIO"]] = None + _default_io: Optional[Callable[[], "StacIO"]] = None @abstractmethod def read_text( @@ -251,7 +251,7 @@ def save_json( self.write_text(dest, txt) @classmethod - def set_default(cls, stac_io_class: Type["StacIO"]) -> None: + def set_default(cls, stac_io_class: Callable[[], "StacIO"]) -> None: """Set the default StacIO instance to use.""" cls._default_io = stac_io_class From a1a79e60f9bd6331cdda51dfdea1b7a8039c66ec Mon Sep 17 00:00:00 2001 From: Rob Emanuele Date: Thu, 11 Nov 2021 22:08:25 -0500 Subject: [PATCH 3/6] Add MockDefaultStacIO This allows the default StacIO to be set inside a context manager for tests that want to utilize a mock as the default StacIO. --- tests/utils/stac_io_mock.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/tests/utils/stac_io_mock.py b/tests/utils/stac_io_mock.py index 8feac99b2..2c07d4b94 100644 --- a/tests/utils/stac_io_mock.py +++ b/tests/utils/stac_io_mock.py @@ -1,25 +1,49 @@ -from typing import Any, Union +from typing import Any, Optional, Union from unittest.mock import Mock import pystac +from pystac.stac_io import DefaultStacIO, StacIO class MockStacIO(pystac.StacIO): """Creates a mock that records StacIO calls for testing and allows clients to replace StacIO functionality, all within a context scope. + + Args: + wrapped_stac_io: The StacIO that will be used to perform the calls. + Defaults to an instance of DefaultStacIO. """ - def __init__(self) -> None: + mock: Mock + wrapped_stac_io: StacIO + + def __init__(self, wrapped_stac_io: Optional[StacIO] = None) -> None: self.mock = Mock() + if wrapped_stac_io is None: + self.wrapped_stac_io = DefaultStacIO() + else: + self.wrapped_stac_io = wrapped_stac_io def read_text( self, source: Union[str, pystac.Link], *args: Any, **kwargs: Any ) -> str: self.mock.read_text(source) - return pystac.StacIO.default().read_text(source) + return self.wrapped_stac_io.read_text(source) def write_text( self, dest: Union[str, pystac.Link], txt: str, *args: Any, **kwargs: Any ) -> None: self.mock.write_text(dest, txt) - pystac.StacIO.default().write_text(dest, txt) + self.wrapped_stac_io.write_text(dest, txt) + + +class MockDefaultStacIO(object): + """Context manager for mocking StacIO.""" + + def __enter__(self) -> MockStacIO: + mock = MockStacIO() + pystac.StacIO.set_default(lambda: mock) + return mock + + def __exit__(self, *args: Any) -> None: + pystac.StacIO.set_default(DefaultStacIO) From 640b7c8b7dfadcc6bb5f84183b32cb21a34afca1 Mon Sep 17 00:00:00 2001 From: Rob Emanuele Date: Thu, 11 Nov 2021 22:09:08 -0500 Subject: [PATCH 4/6] Test ItemCollection.to_dict won't read item roots This test, which fails without a subsequent fix, tests that there should only be a single read operation when convering an ItemCollection to a dict. This test fails due to the root link being read for each item in the logic that determines if heirarchial links should be absolute or relative. --- tests/test_item_collection.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_item_collection.py b/tests/test_item_collection.py index 2f3b976da..3ca79e157 100644 --- a/tests/test_item_collection.py +++ b/tests/test_item_collection.py @@ -7,6 +7,7 @@ import pystac from tests.utils import TestCases +from tests.utils.stac_io_mock import MockDefaultStacIO class TestItemCollection(unittest.TestCase): @@ -176,3 +177,11 @@ def test_from_dict_sets_root(self) -> None: item_collection = ItemCollection.from_dict(param_dict, root=catalog) for item in item_collection.items: self.assertEqual(item.get_root(), catalog) + + def test_to_dict_does_not_read_root_link_of_items(self) -> None: + with MockDefaultStacIO() as mock_stac_io: + item_collection = pystac.ItemCollection.from_file(self.ITEM_COLLECTION) + + item_collection.to_dict() + + self.assertEqual(mock_stac_io.mock.read_text.call_count, 1) From 8b6cec8d80f153183e72b9da8e442e1b7be76946 Mon Sep 17 00:00:00 2001 From: Rob Emanuele Date: Thu, 11 Nov 2021 22:11:33 -0500 Subject: [PATCH 5/6] Allow link href transformation logic to be skipped This adds a new option to determine if a link's href should be transformed based on the link's owner's root catalog type. The current implementation always checks the link's owner for a root catalog, and if it exists and if that root catalog is of type RELATIVE_PUBLISHED or SELF_CONTAINED, and that link is a hierarchical link, then the link's href is transformed to be relative to the link's owner's root catalog. The default option is to maintain this behavior for the default to_dict method, along with the Catalog, Collection and Item to_dict methods; but this flag now allows users to skip this link href transformation, saving reads. The ItemCollection.to_dict method was modified to skip link transformation by default. If Items exist in an ItemCollection, the likely desired behavior is not to modify the link hrefs based on the root catalog type, and the result of transforming hrefs in situations where the ItemCollection does not itself have a root catalog results in a root link resolution for each Item, which can be very slow. --- pystac/catalog.py | 6 ++++-- pystac/collection.py | 8 ++++++-- pystac/item.py | 6 ++++-- pystac/item_collection.py | 18 +++++++++++++++--- pystac/link.py | 29 +++++++++++++++++++++++++---- pystac/stac_object.py | 10 +++++++++- 6 files changed, 63 insertions(+), 14 deletions(-) diff --git a/pystac/catalog.py b/pystac/catalog.py index 80abf825a..3d00f3370 100644 --- a/pystac/catalog.py +++ b/pystac/catalog.py @@ -473,7 +473,9 @@ def get_item_links(self) -> List[Link]: """ return self.get_links(pystac.RelType.ITEM) - def to_dict(self, include_self_link: bool = True) -> Dict[str, Any]: + def to_dict( + self, include_self_link: bool = True, transform_hrefs: bool = True + ) -> Dict[str, Any]: links = self.links if not include_self_link: links = [x for x in links if x.rel != pystac.RelType.SELF] @@ -483,7 +485,7 @@ def to_dict(self, include_self_link: bool = True) -> Dict[str, Any]: "id": self.id, "stac_version": pystac.get_stac_version(), "description": self.description, - "links": [link.to_dict() for link in links], + "links": [link.to_dict(transform_href=transform_hrefs) for link in links], } if self.stac_extensions is not None: diff --git a/pystac/collection.py b/pystac/collection.py index f6f44b621..29fea8453 100644 --- a/pystac/collection.py +++ b/pystac/collection.py @@ -515,8 +515,12 @@ def add_item( super().add_item(item, title, strategy) item.set_collection(self) - def to_dict(self, include_self_link: bool = True) -> Dict[str, Any]: - d = super().to_dict(include_self_link) + def to_dict( + self, include_self_link: bool = True, transform_hrefs: bool = True + ) -> Dict[str, Any]: + d = super().to_dict( + include_self_link=include_self_link, transform_hrefs=transform_hrefs + ) d["extent"] = self.extent.to_dict() d["license"] = self.license if self.stac_extensions is not None: diff --git a/pystac/item.py b/pystac/item.py index 745aa3f09..14c660e17 100644 --- a/pystac/item.py +++ b/pystac/item.py @@ -285,7 +285,9 @@ def get_collection(self) -> Optional[Collection]: else: return cast(Collection, collection_link.resolve_stac_object().target) - def to_dict(self, include_self_link: bool = True) -> Dict[str, Any]: + def to_dict( + self, include_self_link: bool = True, transform_hrefs: bool = True + ) -> Dict[str, Any]: links = self.links if not include_self_link: links = [x for x in links if x.rel != pystac.RelType.SELF] @@ -303,7 +305,7 @@ def to_dict(self, include_self_link: bool = True) -> Dict[str, Any]: "id": self.id, "properties": self.properties, "geometry": self.geometry, - "links": [link.to_dict() for link in links], + "links": [link.to_dict(transform_href=transform_hrefs) for link in links], "assets": assets, } diff --git a/pystac/item_collection.py b/pystac/item_collection.py index 8f1a6bf7f..de38ac34f 100644 --- a/pystac/item_collection.py +++ b/pystac/item_collection.py @@ -116,11 +116,23 @@ def __add__(self, other: object) -> "ItemCollection": return ItemCollection(items=combined, clone_items=False) - def to_dict(self) -> Dict[str, Any]: - """Serializes an :class:`ItemCollection` instance to a JSON-like dictionary.""" + def to_dict(self, transform_hrefs: bool = False) -> Dict[str, Any]: + """Serializes an :class:`ItemCollection` instance to a JSON-like dictionary. + + Args: + transform_hrefs: If True, transform the HREF of hierarchical links + of Items based on the type of catalog the Item belongs to (if any). + I.e. if the item belongs to a root catalog that is + RELATIVE_PUBLISHED or SELF_CONTAINED, + hierarchical link HREFs will be transformed to be relative to the + catalog root. This can be slow if the Items have root links that + have not yet been resolved. Defaults to False. + """ return { "type": "FeatureCollection", - "features": [item.to_dict() for item in self.items], + "features": [ + item.to_dict(transform_hrefs=transform_hrefs) for item in self.items + ], **self.extra_fields, } diff --git a/pystac/link.py b/pystac/link.py index 7de266bbf..a6f8e9431 100644 --- a/pystac/link.py +++ b/pystac/link.py @@ -129,9 +129,16 @@ def href(self) -> str: raise ValueError(f"{self} does not have an HREF set.") return result - def get_href(self) -> Optional[str]: + def get_href(self, transform_href: bool = True) -> Optional[str]: """Gets the HREF for this link. + Args: + transform_href: If True, transform the HREF based on the type of + catalog the owner belongs to (if any). I.e. if the link owner + belongs to a root catalog that is RELATIVE_PUBLISHED or SELF_CONTAINED, + the HREF will be transformed to be relative to the catalog root + if this is a hierarchical link relation. + Returns: str: Returns this link's HREF. If there is an owner of the link and the root catalog (if there is one) is of type RELATIVE_PUBLISHED, @@ -144,7 +151,13 @@ def get_href(self) -> Optional[str]: else: href = self._target_href - if href and is_absolute_href(href) and self.owner and self.owner.get_root(): + if ( + transform_href + and href + and is_absolute_href(href) + and self.owner + and self.owner.get_root() + ): root = self.owner.get_root() rel_links = [ *HIERARCHICAL_LINKS, @@ -304,14 +317,22 @@ def is_resolved(self) -> bool: """ return self._target_object is not None - def to_dict(self) -> Dict[str, Any]: + def to_dict(self, transform_href: bool = True) -> Dict[str, Any]: """Generate a dictionary representing the JSON of this serialized Link. Returns: dict: A serialization of the Link that can be written out as JSON. + transform_href: If True, transform the HREF based on the type of + catalog the owner belongs to (if any). I.e. if the link owner + belongs to a root catalog that is RELATIVE_PUBLISHED or SELF_CONTAINED, + the HREF will be transformed to be relative to the catalog root + if this is a hierarchical link relation. """ - d: Dict[str, Any] = {"rel": self.rel, "href": self.get_href()} + d: Dict[str, Any] = { + "rel": self.rel, + "href": self.get_href(transform_href=transform_href), + } if self.media_type is not None: d["type"] = self.media_type diff --git a/pystac/stac_object.py b/pystac/stac_object.py index b1e4fa7d0..d6871d2d5 100644 --- a/pystac/stac_object.py +++ b/pystac/stac_object.py @@ -428,12 +428,20 @@ def _object_links(self) -> List[str]: raise NotImplementedError @abstractmethod - def to_dict(self, include_self_link: bool = True) -> Dict[str, Any]: + def to_dict( + self, include_self_link: bool = True, transform_hrefs: bool = True + ) -> Dict[str, Any]: """Generate a dictionary representing the JSON of this serialized object. Args: include_self_link : If True, the dict will contain a self link to this object. If False, the self link will be omitted. + transform_hrefs: If True, transform the HREF of hierarchical links + based on the type of catalog this object belongs to (if any). + I.e. if this object belongs to a root catalog that is + RELATIVE_PUBLISHED or SELF_CONTAINED, + hierarchical link HREFs will be transformed to be relative to the + catalog root. dict: A serialization of the object that can be written out as JSON. """ From 98b5c89e7c279a9ff8f8a2c98b487020d5ce4f7b Mon Sep 17 00:00:00 2001 From: Rob Emanuele Date: Thu, 11 Nov 2021 22:25:45 -0500 Subject: [PATCH 6/6] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48faf3592..3745e3ed1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Added Table-extension ([#646](https://github.com/stac-utils/pystac/pull/646)) - Stable support for Python 3.10 ([#656](https://github.com/stac-utils/pystac/pull/656)) - `.python-version` files are now ignored by Git ([#647](https://github.com/stac-utils/pystac/pull/647)) +- Added a flag to allow users to skip transforming hierarchical link HREFs based on root catalog type ([#663](https://github.com/stac-utils/pystac/pull/663)) ### Removed @@ -20,6 +21,7 @@ - Links will get their `title` from their target if no `title` is provided ([#607](https://github.com/stac-utils/pystac/pull/607)) - Relax typing on `LabelClasses` from `List` to `Sequence` ([#627](https://github.com/stac-utils/pystac/pull/627)) - Upgraded datacube-extension to version 2.0.0 ([#645](https://github.com/stac-utils/pystac/pull/645)) +- By default, ItemCollections will not modify Item HREFs based on root catalog type to avoid performance costs of root link reads ([#663](https://github.com/stac-utils/pystac/pull/663)) ### Fixed