Skip to content

Commit c8c1d8c

Browse files
committed
Reducing import time from ~2.7s to ~0.6s
- Many top-of-file imports moved into functions. - Some type annotations removed. - Added a benchmark test to measure our import time.
1 parent 472794e commit c8c1d8c

File tree

9 files changed

+118
-70
lines changed

9 files changed

+118
-70
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ repos:
5050
- id: xcxc-check
5151
name: Check for note-to-self comments (xcxc)
5252
description: Grep all source files for xcxc which signifies a comment that shouldn't be checked in.
53-
entry: bash -c "[[ $(grep -rniI xcxc --exclude .pre-commit-config.yaml --exclude-dir _readthedocs --exclude-dir htmlcov ./* >&2 ; echo $?) == 1 ]]"
53+
entry: bash -c "[[ $(grep -rniI xcxc --exclude .pre-commit-config.yaml --exclude-dir _readthedocs --exclude-dir htmlcov --exclude-dir _results --exclude-dir env ./* >&2 ; echo $?) == 1 ]]"
5454
language: system
5555
pass_filenames: false
5656
always_run: true

benchmarks/benchmarks.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
For more information on writing benchmarks:
44
https://asv.readthedocs.io/en/stable/writing_benchmarks.html."""
55

6+
import subprocess
7+
68
from hyrax import example_benchmarks
79

810

@@ -14,3 +16,14 @@ def time_computation():
1416
def mem_list():
1517
"""Memory computations are prefixed with 'mem' or 'peakmem'."""
1618
return example_benchmarks.memory_computation()
19+
20+
21+
def time_import():
22+
"""
23+
time how long it takes to import our package. This should stay relatively fast.
24+
25+
Note, the actual import time will be slightly lower than this on a comparable system
26+
However, high import times do affect this metric proportionally.
27+
"""
28+
result = subprocess.run(["python", "-c", "import hyrax"])
29+
assert result.returncode == 0

src/hyrax/data_sets/data_set_registry.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
11
# ruff: noqa: D102, B027
22
import logging
33
from collections.abc import Generator
4-
from typing import Optional
54

65
import numpy.typing as npt
7-
from astropy.table import Table
8-
from torch.utils.data import Dataset, IterableDataset
96

107
from hyrax.config_utils import ConfigDict
118
from hyrax.plugin_utils import get_or_load_class, update_registry
@@ -49,7 +46,7 @@ def __len__ ():
4946
5047
"""
5148

52-
def __init__(self, config: ConfigDict, metadata_table: Optional[Table] = None):
49+
def __init__(self, config: ConfigDict, metadata_table=None):
5350
"""
5451
.. py:method:: __init__
5552
@@ -117,6 +114,8 @@ def is_iterable(self):
117114
bool
118115
True if underlying dataset is iterable
119116
"""
117+
from torch.utils.data import Dataset, IterableDataset
118+
120119
if isinstance(self, (Dataset, IterableDataset)):
121120
return isinstance(self, IterableDataset)
122121
else:
@@ -132,6 +131,8 @@ def is_map(self):
132131
bool
133132
True if underlying dataset is map-style
134133
"""
134+
from torch.utils.data import Dataset, IterableDataset
135+
135136
if isinstance(self, (Dataset, IterableDataset)):
136137
# All torch IterableDatasets are also Datasets
137138
return not isinstance(self, IterableDataset)

src/hyrax/data_sets/fits_image_dataset.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -55,19 +55,15 @@
5555

5656
import logging
5757
import time
58-
from collections.abc import Generator, Iterable, Iterator
58+
from collections.abc import Generator, Iterable
5959
from concurrent.futures import Executor
6060
from pathlib import Path
6161
from threading import Thread
6262
from typing import Any, Callable, Optional, Union
6363

6464
import numpy as np
6565
import numpy.typing as npt
66-
from astropy.io import fits
67-
from astropy.table import Table
68-
from torch import Tensor, from_numpy
6966
from torch.utils.data import Dataset
70-
from torchvision.transforms.v2 import CenterCrop, Compose, Lambda, Transform
7167

7268
from hyrax.config_utils import ConfigDict
7369

@@ -99,6 +95,8 @@ def __init__(self, config: ConfigDict):
9995
config : ConfigDict
10096
Nested configuration dictionary for hyrax
10197
"""
98+
from torchvision.transforms.v2 import Lambda
99+
102100
self._config = config
103101

104102
transform_str = config["data_set"]["transform"]
@@ -155,6 +153,9 @@ def _init_from_path(self, path: Union[Path, str]):
155153
Path or string specifying the directory path that is the root of all filenames in the
156154
catalog table
157155
"""
156+
from torch import Tensor
157+
from torchvision.transforms.v2 import Compose
158+
158159
self.path = path
159160

160161
# This is common code
@@ -186,7 +187,7 @@ def _init_from_path(self, path: Union[Path, str]):
186187

187188
logger.info(f"FitsImageDataSet has {len(self)} objects")
188189

189-
def _set_crop_transform(self) -> Transform:
190+
def _set_crop_transform(self):
190191
"""
191192
Returns the crop transform on the image
192193
@@ -196,6 +197,8 @@ def _set_crop_transform(self) -> Transform:
196197
197198
2) Return the crop transform only so it can be added to the transform stack appropriately.
198199
"""
200+
from torchvision.transforms.v2 import CenterCrop
201+
199202
self.cutout_shape = self.config["data_set"]["crop_to"] if self.config["data_set"]["crop_to"] else None
200203

201204
if not isinstance(self.cutout_shape, list) or len(self.cutout_shape) != 2:
@@ -205,7 +208,9 @@ def _set_crop_transform(self) -> Transform:
205208

206209
return CenterCrop(size=self.cutout_shape)
207210

208-
def _read_filter_catalog(self, filter_catalog_path: Optional[Path]) -> Optional[Table]:
211+
def _read_filter_catalog(self, filter_catalog_path: Optional[Path]):
212+
from astropy.table import Table
213+
209214
if filter_catalog_path is None:
210215
msg = "Must provide a filter catalog in config['data_set']['filter_catalog']"
211216
raise RuntimeError(msg)
@@ -250,7 +255,7 @@ def _read_filter_catalog(self, filter_catalog_path: Optional[Path]) -> Optional[
250255

251256
return table
252257

253-
def _parse_filter_catalog(self, table: Optional[Table]) -> None:
258+
def _parse_filter_catalog(self, table) -> None:
254259
"""Sets self.files by parsing the catalog.
255260
256261
Subclasses may override this function to control parsing of the table more directly, but the
@@ -305,7 +310,7 @@ def _before_preload(self) -> None:
305310
# fetching
306311
pass
307312

308-
def _prepare_metadata(self) -> Optional[Table]:
313+
def _prepare_metadata(self):
309314
# This happens when filter_catalog_table is injected in unit tests
310315
if FitsImageDataSet._called_from_test:
311316
return None
@@ -366,7 +371,7 @@ def __len__(self) -> int:
366371
"""
367372
return len(self.files)
368373

369-
def __getitem__(self, idx: int) -> Tensor:
374+
def __getitem__(self, idx: int):
370375
if idx >= len(self.files) or idx < 0:
371376
raise IndexError
372377

@@ -528,7 +533,7 @@ def _preload_tensor_cache(self):
528533
self._log_duration_tensorboard("preload_1k_obj_s", start_time)
529534
start_time = time.monotonic_ns()
530535

531-
def _lazy_map_executor(self, executor: Executor, ids: Iterable[str]) -> Iterator[Tensor]:
536+
def _lazy_map_executor(self, executor: Executor, ids: Iterable[str]):
532537
"""This is a version of concurrent.futures.Executor map() which lazily evaluates the iterator passed
533538
We do this because we do not want all of the tensors to remain in memory during pre-loading. We would
534539
prefer a smaller set of in-flight tensors.
@@ -554,9 +559,10 @@ def _lazy_map_executor(self, executor: Executor, ids: Iterable[str]) -> Iterator
554559
Iterator[torch.Tensor]
555560
An iterator over torch tensors, lazily loaded by running the work_fn as needed.
556561
"""
557-
558562
from concurrent.futures import FIRST_COMPLETED, Future, wait
559563

564+
from torch import Tensor
565+
560566
max_futures = FitsImageDataSet._determine_numprocs_preload()
561567
queue: list[Future[Tensor]] = []
562568
in_progress: set[Future[Tensor]] = set()
@@ -609,15 +615,17 @@ def _log_duration_tensorboard(self, name: str, start_time: int):
609615
duration_s = (now - start_time) / 1.0e9
610616
self.tensorboardx_logger.add_scalar(name, duration_s, since_tensorboard_start_us)
611617

612-
def _check_object_id_to_tensor_cache(self, object_id: str) -> Optional[Tensor]:
618+
def _check_object_id_to_tensor_cache(self, object_id: str):
613619
return self.tensors.get(object_id, None)
614620

615-
def _populate_object_id_to_tensor_cache(self, object_id: str) -> Tensor:
621+
def _populate_object_id_to_tensor_cache(self, object_id: str):
616622
data_torch = self._read_object_id(object_id)
617623
self.tensors[object_id] = data_torch
618624
return data_torch
619625

620-
def _read_object_id(self, object_id: str) -> Tensor:
626+
def _read_object_id(self, object_id: str):
627+
from astropy.io import fits
628+
621629
start_time = time.monotonic_ns()
622630

623631
# Read all the files corresponding to this object
@@ -635,7 +643,9 @@ def _read_object_id(self, object_id: str) -> Tensor:
635643
self._log_duration_tensorboard("object_total_read_time_s", start_time)
636644
return data_torch
637645

638-
def _convert_to_torch(self, data: list[npt.ArrayLike]) -> Tensor:
646+
def _convert_to_torch(self, data: list[npt.ArrayLike]):
647+
from torch import from_numpy
648+
639649
start_time = time.monotonic_ns()
640650

641651
# Push all the filter data into a tensor object
@@ -655,7 +665,7 @@ def _convert_to_torch(self, data: list[npt.ArrayLike]) -> Tensor:
655665
# Do we want to memoize them on first __getitem__ call?
656666
#
657667
# For now we just do it the naive way
658-
def _object_id_to_tensor(self, object_id: str) -> Tensor:
668+
def _object_id_to_tensor(self, object_id: str):
659669
"""Converts an object_id to a pytorch tensor with dimenstions (self.num_filters, self.cutout_shape[0],
660670
self.cutout_shape[1]). This is done by reading the file and slicing away any excess pixels at the
661671
far corners of the image from (0,0).

src/hyrax/data_sets/hsc_data_set.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,9 @@
1010
from typing import Optional
1111

1212
import numpy as np
13-
from astropy.io import fits
14-
from astropy.table import Table
1513
from schwimmbad import MultiPool
16-
from torchvision.transforms.v2 import CenterCrop
1714

1815
from hyrax.config_utils import ConfigDict
19-
from hyrax.download import Downloader
20-
from hyrax.downloadCutout.downloadCutout import (
21-
parse_bool,
22-
parse_degree,
23-
parse_latitude,
24-
parse_longitude,
25-
parse_rerun,
26-
parse_tract_opt,
27-
parse_type,
28-
)
2916

3017
from .fits_image_dataset import FitsImageDataSet, files_dict
3118

@@ -43,6 +30,8 @@ def __init__(self, config: ConfigDict):
4330
.. py:method:: __init__
4431
4532
"""
33+
from hyrax.download import Downloader
34+
4635
# Note "rebuild_manifest" is not a config, its a hack for rebuild_manifest mode
4736
# to ensure we don't use the manifest we believe is corrupt.
4837
rebuild_manifest = config["rebuild_manifest"] if "rebuild_manifest" in config else False # noqa: SIM401
@@ -61,7 +50,9 @@ def __init__(self, config: ConfigDict):
6150

6251
super().__init__(config)
6352

64-
def _read_filter_catalog(self, filter_catalog_path: Optional[Path]) -> Optional[Table]:
53+
def _read_filter_catalog(self, filter_catalog_path: Optional[Path]):
54+
from astropy.table import Table
55+
6556
try:
6657
retval = super()._read_filter_catalog(filter_catalog_path)
6758
except RuntimeError:
@@ -84,7 +75,7 @@ def _read_filter_catalog(self, filter_catalog_path: Optional[Path]) -> Optional[
8475
#
8576
# In the HSC case this will also have to do fallback and call
8677
# _scan_file_dimensions() and/or _scan_file_names() and pass back only the files dict.
87-
def _parse_filter_catalog(self, table: Table) -> None:
78+
def _parse_filter_catalog(self, table) -> None:
8879
object_id_missing = self.object_id_column_name not in table.colnames if table is not None else True
8980
filter_missing = self.filter_column_name not in table.colnames if table is not None else True
9081
filename_missing = self.filename_column_name not in table.colnames if table is not None else True
@@ -137,6 +128,8 @@ def _parse_filter_catalog(self, table: Table) -> None:
137128
return self.files
138129

139130
def _set_crop_transform(self):
131+
from torchvision.transforms.v2 import CenterCrop
132+
140133
cutout_shape = self.config["data_set"]["crop_to"] if self.config["data_set"]["crop_to"] else None
141134
self.cutout_shape = self._check_file_dimensions() if cutout_shape is None else cutout_shape
142135
return CenterCrop(size=self.cutout_shape)
@@ -285,6 +278,8 @@ def _scan_file_dimension(processing_unit: tuple[str, list[str]]) -> tuple[str, l
285278

286279
@staticmethod
287280
def _fits_file_dims(filepath) -> tuple[int, int]:
281+
from astropy.io import fits
282+
288283
try:
289284
with fits.open(filepath) as hdul:
290285
return (hdul[1].shape[0], hdul[1].shape[1])
@@ -439,6 +434,19 @@ def _check_file_dimensions(self) -> tuple[int, int]:
439434
return cutout_width, cutout_height
440435

441436
def _rebuild_manifest(self, config):
437+
from astropy.table import Table
438+
439+
from hyrax.download import Downloader
440+
from hyrax.downloadCutout.downloadCutout import (
441+
parse_bool,
442+
parse_degree,
443+
parse_latitude,
444+
parse_longitude,
445+
parse_rerun,
446+
parse_tract_opt,
447+
parse_type,
448+
)
449+
442450
if self.filter_catalog:
443451
raise RuntimeError("Cannot rebuild manifest. Set the filter_catalog=false and rerun")
444452

src/hyrax/data_sets/hyrax_cifar_data_set.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,7 @@
22
import logging
33

44
import numpy as np
5-
import torchvision.transforms as transforms
6-
from astropy.table import Table
75
from torch.utils.data import Dataset, IterableDataset
8-
from torchvision.datasets import CIFAR10
96

107
from hyrax.config_utils import ConfigDict
118

@@ -18,6 +15,10 @@ class HyraxCifarBase:
1815
"""Base class for Hyrax Cifar datasets"""
1916

2017
def __init__(self, config: ConfigDict):
18+
import torchvision.transforms as transforms
19+
from astropy.table import Table
20+
from torchvision.datasets import CIFAR10
21+
2122
transform = transforms.Compose(
2223
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
2324
)

src/hyrax/data_sets/inference_dataset.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
import numpy as np
88
import numpy.typing as npt
9-
from torch import Tensor, from_numpy
109
from torch.utils.data import Dataset
1110

1211
from hyrax.config_utils import find_most_recent_results_dir
@@ -98,7 +97,9 @@ def ids(self) -> Generator[str]:
9897
"""
9998
return (str(id) for id in self.batch_index["id"])
10099

101-
def __getitem__(self, idx: Union[int, np.ndarray]) -> Tensor:
100+
def __getitem__(self, idx: Union[int, np.ndarray]):
101+
from torch import from_numpy
102+
102103
try:
103104
_ = (e for e in idx) # type: ignore[union-attr]
104105
except TypeError:

0 commit comments

Comments
 (0)