Add performance benchmarks (#748)

duckontheweb · gadomski · web-flow · commit 2aaa16207d92 · 2023-01-11T16:15:30.000-07:00
* Configure asv and add import benchmark

* Add item (de)serialization benchmarks & use 10 reps

* Add catalog, collection benchmarks and tweak settings

* Add convenience script for running locally

* Use default Python

* Add benchmark workflow to CI

* Match label condition to label name in repo

* Fix lint errors

* Add virtualenv to benchmark deps

* Fix artifact name, increase failure threshold

* rm: benchmarks workflow

I'm of the opinion that we _shouldn't_ run benchmarks on GitHub runners, so I'm
removing this workflow.

* refactor: use classes directly

* refactor: move benchmarks up a level

This lets simple commands like `asv dev` work out of the box.

* feat: add projection benchmarks

I'm not really sure how useful this is, but it was asked for so at least we have
something.

* feat: add large catalog benchmarks

* fix: benchmark config

* feat: add benchmark docs

* ci: add benchmark check

This doesn't run benchmarks, but just checks to make sure they build.

* ci: set the asv machine

* ci: install pystac for benchmarks

* docs: add more text about running benchmarks

* bench: use timeraw for import

Co-authored-by: Pete Gadomski <pete.gadomski@gmail.com>
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
@@ -158,3 +158,24 @@ jobs:
 
       - name: Install dev dependencies
         run: pip install -r requirements-dev.txt
+
+  check-benchmarks:
+    # This checks to make sure any API changes haven't broken any of the
+    # benchmarks. It doesn't do any actual benchmarking, since (IMO) that's not
+    # appropriate for CI on GitHub Actions.
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          cache: "pip"
+          cache-dependency-path: requirements-bench.txt
+      - name: Install pystac
+        run: pip install .
+      - name: Install benchmark dependencies
+        run: pip install -r requirements-bench.txt
+      - name: Set asv machine
+        run: asv machine --yes
+      - name: Check benchmarks
+        run: asv dev -a repeat=1 -a rounds=1 --strict
diff --git a/.gitignore b/.gitignore
@@ -157,3 +157,6 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+
+# asv environments
+.asv
diff --git a/asv.conf.json b/asv.conf.json
@@ -0,0 +1,24 @@
+{
+    "version": 1,
+    "project": "pystac",
+    "project_url": "https://pystac.readthedocs.io/",
+    "repo": ".",
+    "branches": [
+        "main"
+    ],
+    "dvcs": "git",
+    "environment_type": "virtualenv",
+    "show_commit_url": "http://github.com/stac-utils/pystac/commit/",
+    "matrix": {
+        "req": {
+            "orjson": [
+                null,
+                ""
+            ]
+        }
+    },
+    "benchmark_dir": "benchmarks",
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html"
+}
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
diff --git a/benchmarks/_base.py b/benchmarks/_base.py
@@ -0,0 +1,6 @@
+class Bench:
+    """Base class carrying the timing settings shared by the benchmark classes."""
+
+    # asv ``repeat`` given as (min_repeat, max_repeat, max_time): collect between
+    # 10 and 50 samples, stopping early once 2.0 seconds have been spent.
+    repeat = (10, 50, 2.0)
+
+    # Bump number of rounds to 4
+    rounds = 4
diff --git a/benchmarks/_util.py b/benchmarks/_util.py
@@ -0,0 +1,16 @@
+import os
+from typing import Union, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    PathLike = os.PathLike[str]
+else:
+    PathLike = os.PathLike
+
+
+def get_data_path(rel_path: Union[str, PathLike]) -> str:
+    """Gets the absolute path to a file based on a path relative to the
+    tests/data-files directory in this repo."""
+    rel_path = os.fspath(rel_path)
+    return os.path.abspath(
+        os.path.join(os.path.dirname(__file__), "..", "tests", "data-files", rel_path)
+    )
diff --git a/benchmarks/catalog.py b/benchmarks/catalog.py
@@ -0,0 +1,118 @@
+import datetime
+import json
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from pystac import (
+    Catalog,
+    StacIO,
+    Collection,
+    Extent,
+    TemporalExtent,
+    SpatialExtent,
+    Item,
+)
+
+from ._base import Bench
+from ._util import get_data_path
+
+
+class CatalogBench(Bench):
+    def setup(self) -> None:
+        self.temp_dir = tempfile.mkdtemp()
+
+        self.stac_io = StacIO.default()
+
+        self.catalog_path = get_data_path("examples/1.0.0/catalog.json")
+        with open(self.catalog_path) as src:
+            self.catalog_dict = json.load(src)
+        self.catalog = Catalog.from_file(self.catalog_path)
+
+    def teardown(self) -> None:
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def time_catalog_from_file(self) -> None:
+        """Deserialize an Item from file"""
+        _ = Catalog.from_file(self.catalog_path)
+
+    def time_catalog_from_dict(self) -> None:
+        """Deserialize an Item from dictionary."""
+        _ = Catalog.from_dict(self.catalog_dict)
+
+    def time_catalog_to_dict(self) -> None:
+        """Serialize an Item to a dictionary."""
+        self.catalog.to_dict(include_self_link=True)
+
+    def time_catalog_save(self) -> None:
+        """Serialize an Item to a JSON file."""
+        self.catalog.save_object(
+            include_self_link=True,
+            dest_href=os.path.join(self.temp_dir, "time_catalog_save.json"),
+            stac_io=self.stac_io,
+        )
+
+
+class WalkCatalogBench(Bench):
+    def setup_cache(self) -> Catalog:
+        return make_large_catalog()
+
+    def time_walk(self, catalog: Catalog) -> None:
+        for (
+            _,
+            _,
+            _,
+        ) in catalog.walk():
+            pass
+
+    def peakmem_walk(self, catalog: Catalog) -> None:
+        for (
+            _,
+            _,
+            _,
+        ) in catalog.walk():
+            pass
+
+
+class ReadCatalogBench(Bench):
+    def setup(self) -> None:
+        catalog = make_large_catalog()
+        self.temporary_directory = TemporaryDirectory()
+        self.path = str(Path(self.temporary_directory.name) / "catalog.json")
+        catalog.normalize_and_save(self.temporary_directory.name)
+
+    def teardown(self) -> None:
+        shutil.rmtree(self.temporary_directory.name)
+
+    def time_read_and_walk(self) -> None:
+        catalog = Catalog.from_file(self.path)
+        for _, _, _ in catalog.walk():
+            pass
+
+
+class WriteCatalogBench(Bench):
+    def setup(self) -> None:
+        self.catalog = make_large_catalog()
+        self.temporary_directory = TemporaryDirectory()
+
+    def teardown(self) -> None:
+        shutil.rmtree(self.temporary_directory.name)
+
+    def time_normalize_and_save(self) -> None:
+        self.catalog.normalize_and_save(self.temporary_directory.name)
+
+
+def make_large_catalog() -> Catalog:
+    catalog = Catalog("an-id", "a description")
+    extent = Extent(
+        SpatialExtent([[-180.0, -90.0, 180.0, 90.0]]),
+        TemporalExtent([[datetime.datetime(2023, 1, 1), None]]),
+    )
+    for i in range(0, 10):
+        collection = Collection(f"collection-{i}", f"Collection {i}", extent)
+        for j in range(0, 100):
+            item = Item(f"item-{i}-{j}", None, None, datetime.datetime.now(), {})
+            collection.add_item(item)
+        catalog.add_child(collection)
+    return catalog
diff --git a/benchmarks/collection.py b/benchmarks/collection.py
@@ -0,0 +1,43 @@
+import json
+import os
+import shutil
+import tempfile
+from pystac import StacIO, Collection
+
+from ._base import Bench
+from ._util import get_data_path
+
+
+class CollectionBench(Bench):
+    def setup(self) -> None:
+        self.temp_dir = tempfile.mkdtemp()
+
+        self.stac_io = StacIO.default()
+
+        self.collection_path = get_data_path("examples/1.0.0/collection.json")
+        with open(self.collection_path) as src:
+            self.collection_dict = json.load(src)
+        self.collection = Collection.from_file(self.collection_path)
+
+    def teardown(self) -> None:
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def time_collection_from_file(self) -> None:
+        """Deserialize an Item from file"""
+        _ = Collection.from_file(self.collection_path)
+
+    def time_collection_from_dict(self) -> None:
+        """Deserialize an Item from dictionary."""
+        _ = Collection.from_dict(self.collection_dict)
+
+    def time_collection_to_dict(self) -> None:
+        """Serialize an Item to a dictionary."""
+        self.collection.to_dict(include_self_link=True)
+
+    def time_collection_save(self) -> None:
+        """Serialize an Item to a JSON file."""
+        self.collection.save_object(
+            include_self_link=True,
+            dest_href=os.path.join(self.temp_dir, "time_collection_save.json"),
+            stac_io=self.stac_io,
+        )
diff --git a/benchmarks/extensions/__init__.py b/benchmarks/extensions/__init__.py
diff --git a/benchmarks/extensions/projection.py b/benchmarks/extensions/projection.py
@@ -0,0 +1,14 @@
+import datetime
+
+from pystac import Item
+from pystac.extensions.projection import ProjectionExtension
+
+from .._base import Bench
+
+
+class ProjectionBench(Bench):
+    def setup(self) -> None:
+        self.item = Item("an-id", None, None, datetime.datetime.now(), {})
+
+    def time_add_projection_extension(self) -> None:
+        _ = ProjectionExtension.ext(self.item, add_if_missing=True)
diff --git a/benchmarks/import_pystac.py b/benchmarks/import_pystac.py
@@ -0,0 +1,7 @@
+class ImportPySTACBench:
+    """Benchmark the cost of ``import pystac``."""
+
+    # Does not inherit the shared Bench settings; uses a fixed repeat of 10.
+    repeat = 10
+
+    def timeraw_import_pystac(self) -> str:
+        """Return the source snippet that asv times for this ``timeraw_`` benchmark."""
+        return """
+        import pystac
+        """
diff --git a/benchmarks/item.py b/benchmarks/item.py
@@ -0,0 +1,43 @@
+import json
+import os
+import shutil
+import tempfile
+from pystac import StacIO, Item
+
+from ._base import Bench
+from ._util import get_data_path
+
+
+class ItemBench(Bench):
+    def setup(self) -> None:
+        self.temp_dir = tempfile.mkdtemp()
+
+        self.stac_io = StacIO.default()
+
+        self.item_path = get_data_path("item/sample-item-asset-properties.json")
+        with open(self.item_path) as src:
+            self.item_dict = json.load(src)
+        self.item = Item.from_file(self.item_path)
+
+    def teardown(self) -> None:
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def time_item_from_file(self) -> None:
+        """Deserialize an Item from file"""
+        _ = Item.from_file(self.item_path)
+
+    def time_item_from_dict(self) -> None:
+        """Deserialize an Item from dictionary."""
+        _ = Item.from_dict(self.item_dict)
+
+    def time_item_to_dict(self) -> None:
+        """Serialize an Item to a dictionary."""
+        self.item.to_dict(include_self_link=True)
+
+    def time_item_save(self) -> None:
+        """Serialize an Item to a JSON file."""
+        self.item.save_object(
+            include_self_link=True,
+            dest_href=os.path.join(self.temp_dir, "time_item_save.json"),
+            stac_io=self.stac_io,
+        )
diff --git a/docs/contributing.rst b/docs/contributing.rst
@@ -73,6 +73,45 @@ flag to Git commit commands, as in ``git commit --no-verify``.
 .. [#] In rare cases changes to one file might invalidate an unchanged file, such as
    when modifying the return type of a function used in another file.
 
+Benchmarks
+^^^^^^^^^^
+
+PySTAC uses `asv <https://asv.readthedocs.io>`_ for benchmarking. Benchmarks are
+defined in the ``./benchmarks`` directory. Due to the inherent uncertainty in
+the environment of GitHub workflow runners, benchmarks are not executed in CI.
+If your changes may affect performance, use the provided script to run the
+benchmark suite locally. This script will compare your current ``HEAD`` with
+the **main** branch and report any improvements or regressions.
+
+.. code-block:: bash
+
+    scripts/bench
+
+The benchmark suite takes a while to run, and will report any significant
+changes to standard output. For example, here's a benchmark comparison between
+v1.0.0 and v1.6.1 (from `@gadomski's <https://github.com/gadomski>`_ computer)::
+
+          before           after         ratio
+        [eee06027]       [579c071b]
+        <v1.0.0^0>       <v1.6.1^0>
+    -        533±20μs         416±10μs     0.78  collection.CollectionBench.time_collection_from_file [gadomski/virtualenv-py3.10-orjson]
+    -         329±8μs         235±10μs     0.72  collection.CollectionBench.time_collection_from_dict [gadomski/virtualenv-py3.10-orjson]
+    -        332±10μs          231±4μs     0.70  collection.CollectionBench.time_collection_from_dict [gadomski/virtualenv-py3.10]
+    -         174±4μs          106±2μs     0.61  item.ItemBench.time_item_from_dict [gadomski/virtualenv-py3.10]
+    -         174±4μs          106±2μs     0.61  item.ItemBench.time_item_from_dict [gadomski/virtualenv-py3.10-orjson]
+        before           after         ratio
+        [eee06027]       [579c071b]
+        <v1.0.0^0>       <v1.6.1^0>
+    +        87.1±3μs          124±5μs     1.42  catalog.CatalogBench.time_catalog_from_dict [gadomski/virtualenv-py3.10]
+    +        87.1±4μs          122±5μs     1.40  catalog.CatalogBench.time_catalog_from_dict [gadomski/virtualenv-py3.10-orjson]
+
+When developing new benchmarks, you can run a shortened version of the benchmark suite:
+
+.. code-block:: bash
+
+    asv dev
+
+
 CHANGELOG
 ^^^^^^^^^
 
diff --git a/requirements-bench.txt b/requirements-bench.txt
@@ -0,0 +1,2 @@
+asv==0.5.1
+virtualenv==20.13.1
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,4 +1,5 @@
 -r ./requirements-docs.txt
 -r ./requirements-test.txt
+-r ./requirements-bench.txt
 
 jupyter==1.0.0
diff --git a/scripts/bench b/scripts/bench
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -e
+
+if [[ -z $ASV_FACTOR ]]; then
+    ASV_FACTOR=1.25;
+fi
+
+asv continuous --split -e --interleave-rounds \
+    --factor ${ASV_FACTOR} \
+    main HEAD;