
Commit d5a8d7f

Iterable dataset support (#273)

- Adds an example CIFAR iterable dataset
- Changes pytorch_ignite.py to work around an ignite bug with iterable datasets (pytorch/ignite#3372)
- New is_iterable() and is_map() interface on the dataset base class to unify discernment logic
- Abstract base classes that derive from HyraxDataset are no longer themselves checked for required methods
- Documentation added for iterable external datasets
- End-to-end tests for iterable datasets

1 parent fee6355 commit d5a8d7f

File tree: 9 files changed (+175, -38 lines)


docs/external_libraries.rst

Lines changed: 23 additions & 4 deletions

@@ -75,13 +75,19 @@ items in the batch. This loss is logged to MLflow and tensorboard.
 Defining a dataset class
 ------------------------
 
-Dataset classes are written as subclasses of both ``hyrax.data_sets.HyraxDataset`` and
-``torch.utils.data.Dataset``. Datasets must minimally define the methods below. These are similar in form to
-Torch's `Map-style datasets <https://pytorch.org/docs/stable/data.html#map-style-datasets>`_
+Dataset classes are written as subclasses of ``hyrax.data_sets.HyraxDataset``. Datasets must choose to be
+either "map style" and also inherit from ``torch.utils.data.Dataset``, or "iterable" and inherit from
+``torch.utils.data.IterableDataset``. `Look here <https://pytorch.org/docs/stable/data.html#dataset-types>`_
+for an overview of the difference between map-style and iterable datasets.
 
-A fully worked example of creating a custom dataset class is in the example notebook
+A fully worked example of creating a custom map-style dataset class is in the example notebook
 :doc:`/pre_executed/custom_dataset`
 
+The required methods are detailed by category below.
+
+All datasets
+............
+
 ``__init__(self, config)``
 .................................
 On creation of your dataset Hyrax passes the entire Hyrax config as a nested dictionary in the ``config``
@@ -92,6 +98,9 @@ dataset will be done by Hyrax, when running the relevant verb. Further detail on
 You must call ``super().__init__(config)`` or ``super().__init__(config, metadata_table)`` in your
 ``__init__`` function
 
+Map style datasets
+..................
+
 ``__getitem__(self, idx: int)``
 ............................
 Return a single item in your dataset given a zero-based index.
@@ -100,6 +109,16 @@ Return a single item in your dataset given a zero-based index.
 .................
 Return the length of your dataset.
 
+Iterable datasets
+.................
+
+``__iter__(self)``
+.................
+Yield a single item in your dataset, or supply a generator function which does the same.
+If your dataset has an end, raise ``StopIteration`` at the end (a generator does this automatically).
+
+Warning: Iterable datasets which never raise ``StopIteration`` are not currently supported in Hyrax.
+
 Optional Overrides
 ..................
 

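The documentation hunk above describes the two dataset flavors and the methods each must supply. A minimal sketch of the iterable shape, with ``hyrax.data_sets.HyraxDataset`` replaced by a stand-in stub so the example is self-contained (the class and item values here are illustrative, not real Hyrax code):

```python
# Illustration only: _StubBase stands in for hyrax.data_sets.HyraxDataset,
# which in real code you would inherit alongside torch.utils.data.IterableDataset.

class _StubBase:
    """Stand-in for HyraxDataset: accepts the Hyrax config, as required."""

    def __init__(self, config, metadata_table=None):
        self._config = config


class MyIterableDataset(_StubBase):
    """Iterable-style dataset: items are yielded one at a time via __iter__."""

    def __init__(self, config):
        super().__init__(config)  # HyraxDataset requires this super() call
        self._items = [10, 20, 30]

    def __iter__(self):
        # A generator raises StopIteration automatically when it returns,
        # which is how a finite iterable dataset signals its end.
        yield from self._items


ds = MyIterableDataset(config={})
print(list(ds))  # [10, 20, 30]
```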
pyproject.toml

Lines changed: 6 additions & 1 deletion

@@ -17,7 +17,12 @@ dynamic = ["version"]
 requires-python = ">=3.9"
 dependencies = [
     "astropy", # Used to load fits files of sources to query HSC cutout server
-    "pytorch-ignite", # Used for distributed training, logging, etc.
+    # Pin to the current version of pytorch-ignite so workarounds to
+    # https://github.com/pytorch/ignite/issues/3372 function correctly
+    # while allowing us to release packages that don't depend on dev versions
+    # of pytorch-ignite.
+    "pytorch-ignite <= 0.5.2", # Used for distributed training, logging, etc.
+    "more-itertools", # Used to work around the issue in pytorch-ignite above
     "toml", # Used to load configuration files as dictionaries
     "tomlkit", # Used to load configuration files as dictionaries and retain comments
     "torch", # Used for CNN model and in train.py

src/hyrax/data_sets/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@
     "DATA_SET_REGISTRY",
     "HyraxCifarDataSet",
     "FitsImageDataSet",
+    "HyraxCifarIterableDataSet",
     "HSCDataSet",
     "InferenceDataSet",
     "Dataset",

src/hyrax/data_sets/data_set_registry.py

Lines changed: 39 additions & 7 deletions

@@ -5,6 +5,7 @@
 
 import numpy.typing as npt
 from astropy.table import Table
+from torch.utils.data import Dataset, IterableDataset
 
 from hyrax.config_utils import ConfigDict
 from hyrax.plugin_utils import get_or_load_class, update_registry
@@ -86,15 +87,46 @@ def __init__(config):
         self._metadata_table = metadata_table
         self.tensorboardx_logger = None
 
+    def is_iterable(self):
+        """
+        Returns True if the underlying dataset is iterable style, supporting __iter__, vs map style
+        where __getitem__/__len__ are the preferred access methods.
+
+        Returns
+        -------
+        bool
+            True if the underlying dataset is iterable
+        """
+        if isinstance(self, (Dataset, IterableDataset)):
+            return isinstance(self, IterableDataset)
+        else:
+            return hasattr(self, "__iter__")
+
+    def is_map(self):
+        """
+        Returns True if the underlying dataset is map style, supporting __getitem__/__len__, vs iterable
+        where __iter__ is the preferred access method.
+
+        Returns
+        -------
+        bool
+            True if the underlying dataset is map-style
+        """
+        if isinstance(self, (Dataset, IterableDataset)):
+            # All torch IterableDatasets are also Datasets
+            return not isinstance(self, IterableDataset)
+        else:
+            return hasattr(self, "__getitem__")
+
     @property
     def config(self):
         return self._config
 
     def __init_subclass__(cls):
-        from torch.utils.data import IterableDataset
+        from abc import ABC
 
-        if IterableDataset in cls.__bases__ or hasattr(cls, "__iter__"):
-            logger.error("Hyrax does not fully support iterable data sets yet. Proceed at your own risk.")
+        if ABC in cls.__bases__:
+            return
 
         # Paranoia. Deriving from a torch dataset class should ensure this, but if an external dataset author
         # forgets to do that, we tell them.
@@ -126,10 +158,10 @@ def ids(self) -> Generator[str]:
         A generator yielding all the string IDs of the dataset.
 
         """
-        if hasattr(self, "__len__"):
+        if self.is_map():
            for x in range(len(self)):
                yield str(x)
-        elif hasattr(self, "__iter__"):
+        elif self.is_iterable():
            for index, _ in enumerate(iter(self)):
                yield (str(index))
        else:
@@ -145,10 +177,10 @@ def shape(self) -> tuple:
         tuple
             Shape tuple of the tensor that will be returned from the dataset.
         """
-        if hasattr(self, "__getitem__"):
+        if self.is_map():
            data_sample = self[0]
            return data_sample[0].shape if isinstance(data_sample, tuple) else data_sample.shape
-        elif hasattr(self, "__iter__"):
+        elif self.is_iterable():
            data_sample = next(iter(self))
            return data_sample[0].shape if isinstance(data_sample, tuple) else data_sample.shape
        else:

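The new ``is_iterable()``/``is_map()`` helpers above fall back to ``hasattr`` checks when a dataset does not derive from the torch dataset classes. A dependency-free sketch of just that fallback branch (the ``MapStyle``/``IterStyle`` class names are illustrative):

```python
# Sketch of the non-torch fallback branch of is_iterable()/is_map():
# the torch isinstance() branch is skipped here to avoid the dependency.

class MapStyle:
    """Map-style: indexed access via __getitem__ plus __len__."""

    def __getitem__(self, idx):
        return idx

    def __len__(self):
        return 3


class IterStyle:
    """Iterable-style: sequential access via __iter__ only."""

    def __iter__(self):
        yield from range(3)


def is_iterable(ds):
    # Iterable style prefers __iter__ as its access method.
    return hasattr(ds, "__iter__")


def is_map(ds):
    # Map style prefers __getitem__/__len__ as its access methods.
    return hasattr(ds, "__getitem__")


print(is_map(MapStyle()), is_iterable(IterStyle()))  # True True
```

Note that ``hasattr`` is checked on the instance, so a map-style class without an explicit ``__iter__`` attribute reports ``is_iterable() == False`` even though Python can still iterate it through ``__getitem__``.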
src/hyrax/data_sets/hsc_data_set.py

Lines changed: 1 addition & 1 deletion

@@ -321,7 +321,7 @@ def _prune_objects(self, filters_ref: list[str], cutout_shape: Optional[tuple[in
         # Drop objects that can't meet the cutout size provided
         for shape in self.dims[object_id]:
             if shape[0] < cutout_shape[0] or shape[1] < cutout_shape[1]:
-                msg = f"A file for object {object_id} has shape ({shape[1]}px, {shape[1]}px)"
+                msg = f"A file for object {object_id} has shape ({shape[0]}px, {shape[1]}px)"
                 msg += " this is too small for the given cutout size of "
                 msg += f"({cutout_shape[0]}px, {cutout_shape[1]}px)"
                 self._mark_for_prune(object_id, msg)

src/hyrax/data_sets/hyrax_cifar_data_set.py

Lines changed: 28 additions & 2 deletions

@@ -4,6 +4,7 @@
 import numpy as np
 import torchvision.transforms as transforms
 from astropy.table import Table
+from torch.utils.data import IterableDataset
 from torchvision.datasets import CIFAR10
 
 from hyrax.config_utils import ConfigDict
@@ -14,8 +15,8 @@
 
 
 class HyraxCifarDataSet(HyraxDataset, CIFAR10):
-    """This is simply a version of CIFAR10 that has our needed shape method, and is initialized using
-    Hyrax config with a transformation that works well for example code.
+    """This is simply a version of CIFAR10 that is initialized using Hyrax config with a transformation
+    that works well for example code.
 
     We only use the training split in the data, because it is larger (50k images). Hyrax will then divide that
     into Train/test/Validate according to configuration.
@@ -30,3 +31,28 @@ def __init__(self, config: ConfigDict):
         )
         metadata_table = Table({"label": np.array([self[index][1] for index in range(len(self))])})
         super().__init__(config, metadata_table)
+
+
+class HyraxCifarIterableDataSet(HyraxDataset, IterableDataset):
+    """This is simply a version of CIFAR10 that is initialized using Hyrax config with a transformation
+    that works well for example code. This version only supports iteration, not map-style access.
+
+    We only use the training split in the data, because it is larger (50k images). Hyrax will then divide that
+    into Train/test/Validate according to configuration.
+    """
+
+    def __init__(self, config: ConfigDict):
+        transform = transforms.Compose(
+            [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
+        )
+        self.cifar = CIFAR10(
+            root=config["general"]["data_dir"], train=True, download=True, transform=transform
+        )
+        metadata_table = Table(
+            {"label": np.array([self.cifar[index][1] for index in range(len(self.cifar))])}
+        )
+        super().__init__(config, metadata_table)
+
+    def __iter__(self):
+        for item in self.cifar:
+            yield item

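``HyraxCifarIterableDataSet`` above wraps a map-style ``CIFAR10`` in ``self.cifar`` and exposes only ``__iter__``. The same composition pattern, sketched with the torch pieces swapped for plain lists so it runs standalone (``WrappedIterable`` and the sample items are illustrative names, not Hyrax API):

```python
# Composition pattern: own an inner indexable collection, expose only iteration.

class WrappedIterable:
    """Holds an inner dataset (playing the role of self.cifar) but only
    supports sequential access through __iter__, not indexing."""

    def __init__(self, inner):
        self.inner = inner

    def __iter__(self):
        # Delegate: yield each (image, label) pair of the inner dataset in order.
        for item in self.inner:
            yield item


ds = WrappedIterable([("img0", 0), ("img1", 1)])
labels = [label for _, label in ds]
print(labels)  # [0, 1]
```

Delegation keeps the metadata table construction in ``__init__`` able to index ``self.cifar`` directly, while consumers of the Hyrax dataset see only the iterable interface.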
src/hyrax/infer.py

Lines changed: 2 additions & 1 deletion

@@ -43,7 +43,8 @@ def run(config: ConfigDict):
     data_set = setup_dataset(config, tensorboardx_logger)
 
     model = setup_model(config, data_set)
-    logger.info(f"data set has length {len(data_set)}")  # type: ignore[arg-type]
+    if data_set.is_map():
+        logger.info(f"data set has length {len(data_set)}")  # type: ignore[arg-type]
     data_loader = dist_data_loader(data_set, config, split=config["infer"]["split"])
 
     log_runtime_config(config, results_dir)

src/hyrax/pytorch_ignite.py

Lines changed: 69 additions & 21 deletions

@@ -12,7 +12,7 @@
 import mlflow
 
 import torch
-from ignite.engine import Engine, Events
+from ignite.engine import Engine, EventEnum, Events
 from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine
 from ignite.handlers.tqdm_logger import ProgressBar
 from tensorboardX import SummaryWriter
@@ -103,6 +103,9 @@ def dist_data_loader(
     For multiple splits, we return a dictionary where the keys are the names of the splits
     and the value is either a Dataloader as described above or the value None if the split
     was not configured.
+
+    If an iterable dataset is passed, we cannot create multiple splits with a PyTorch sampler object,
+    so we return the same thing for all splits: a dataloader representing the entire iterable.
     """
     # Handle case where no split is needed.
     if isinstance(split, bool):
@@ -118,18 +121,25 @@ def dist_data_loader(
     if seed is not None:
         torch_rng.manual_seed(seed)
 
-    # Create the indexes for all splits based on config.
-    indexes = create_splits(data_set, config)
-
-    # Create samplers and dataloaders for each split we are interested in
-    samplers = {
-        s: SubsetRandomSampler(indexes[s], generator=torch_rng) if indexes.get(s) else None for s in split
-    }
-
-    dataloaders = {
-        split: idist.auto_dataloader(data_set, sampler=sampler, **config["data_loader"]) if sampler else None
-        for split, sampler in samplers.items()
-    }
+    if data_set.is_iterable():
+        dataloaders = {
+            s: idist.auto_dataloader(data_set, pin_memory=True, **config["data_loader"]) for s in split
+        }
+    else:
+        # Create the indexes for all splits based on config.
+        indexes = create_splits(data_set, config)
+
+        # Create samplers and dataloaders for each split we are interested in
+        samplers = {
+            s: SubsetRandomSampler(indexes[s], generator=torch_rng) if indexes.get(s) else None for s in split
+        }
+
+        dataloaders = {
+            split: idist.auto_dataloader(data_set, sampler=sampler, **config["data_loader"])
+            if sampler
+            else None
+            for split, sampler in samplers.items()
+        }
 
     # Return only one if we were only passed one split in, return the dictionary otherwise.
     return dataloaders[split[0]] if len(split) == 1 else dataloaders
@@ -363,6 +373,7 @@ def create_validator(
     model = idist.auto_model(model)
 
     validator = create_engine("train_step", device, model)
+    fixup_engine(validator)
 
     @validator.on(Events.STARTED)
     def set_model_to_eval_mode():
@@ -372,12 +383,12 @@ def set_model_to_eval_mode():
     def set_model_to_train_mode():
         model.train()
 
-    @validator.on(Events.EPOCH_COMPLETED)
+    @validator.on(HyraxEvents.HYRAX_EPOCH_COMPLETED)
     def log_training_loss():
         logger.debug(f"Validation run time: {validator.state.times['EPOCH_COMPLETED']:.2f}[s]")
         logger.debug(f"Validation metrics: {validator.state.output}")
 
-    @trainer.on(Events.EPOCH_COMPLETED)
+    @trainer.on(HyraxEvents.HYRAX_EPOCH_COMPLETED)
     def run_validation():
         validator.run(validation_data_loader)
 
@@ -386,7 +397,7 @@ def log_validation_loss(validator, trainer):
         tensorboardx_logger.add_scalar("training/validation/loss", validator.state.output["loss"], step)
         mlflow.log_metrics({"validation/loss": validator.state.output["loss"]}, step=step)
 
-    validator.add_event_handler(Events.EPOCH_COMPLETED, log_validation_loss, trainer)
+    validator.add_event_handler(HyraxEvents.HYRAX_EPOCH_COMPLETED, log_validation_loss, trainer)
 
     return validator
 
@@ -419,6 +430,7 @@ def create_trainer(
     model.train()
     model = idist.auto_model(model)
     trainer = create_engine("train_step", device, model)
+    fixup_engine(trainer)
 
     optimizer = extract_model_method(model, "optimizer")
 
@@ -435,18 +447,19 @@ def create_trainer(
         to_save,
         DiskSaver(results_directory, require_empty=False),
         n_saved=1,
-        global_step_transform=global_step_from_engine(trainer),
+        global_step_transform=global_step_from_engine(trainer, Events.EPOCH_COMPLETED),
         filename_pattern="{name}_epoch_{global_step}.{ext}",
     )
 
     def neg_loss_score(engine):
+        print(engine.state)
         return -engine.state.output["loss"]
 
     best_checkpoint = Checkpoint(
         to_save,
         DiskSaver(results_directory, require_empty=False),
         n_saved=1,
-        global_step_transform=global_step_from_engine(trainer),
+        global_step_transform=global_step_from_engine(trainer, Events.EPOCH_COMPLETED),
         score_name="loss",
         score_function=neg_loss_score,
         greater_or_equal=True,
@@ -473,13 +486,13 @@ def log_training_loss_tensorboard(trainer):
         tensorboardx_logger.add_scalar("training/training/loss", trainer.state.output["loss"], step)
         mlflow.log_metrics({"training/loss": trainer.state.output["loss"]}, step=step)
 
-    @trainer.on(Events.EPOCH_COMPLETED)
+    @trainer.on(HyraxEvents.HYRAX_EPOCH_COMPLETED)
     def log_training_loss(trainer):
         logger.debug(f"Epoch {trainer.state.epoch} run time: {trainer.state.times['EPOCH_COMPLETED']:.2f}[s]")
         logger.debug(f"Epoch {trainer.state.epoch} metrics: {trainer.state.output}")
 
-    trainer.add_event_handler(Events.EPOCH_COMPLETED, latest_checkpoint)
-    trainer.add_event_handler(Events.EPOCH_COMPLETED, best_checkpoint)
+    trainer.add_event_handler(HyraxEvents.HYRAX_EPOCH_COMPLETED, latest_checkpoint)
+    trainer.add_event_handler(HyraxEvents.HYRAX_EPOCH_COMPLETED, best_checkpoint)
 
     @trainer.on(Events.COMPLETED)
     def log_total_time(trainer):
@@ -498,3 +511,38 @@ def log_best_checkpoint_location(_, best_checkpoint):
     pbar.attach(trainer)
 
     return trainer
+
+
+class HyraxEvents(EventEnum):
+    """
+    Workaround event for a pytorch ignite bug. See fixup_engine for details.
+    """
+
+    HYRAX_EPOCH_COMPLETED = "HyraxEpochCompleted"
+
+
+def fixup_engine(engine: Engine) -> Engine:
+    """
+    Workaround for this pytorch ignite bug (https://github.com/pytorch/ignite/issues/3372) where
+    engine.state.output is not available at EPOCH_COMPLETED or later times (COMPLETED, etc.)
+
+    We create a new event HYRAX_EPOCH_COMPLETED which triggers at ITERATION_COMPLETED, but only on the final
+    iteration. This is just before the erroneous state reset.
+
+    This hack relies on pytorch ignite internal state, but can be removed as soon as our fix is mainlined
+    (https://github.com/pytorch/ignite/pull/3373) in version 0.6.0, estimated August 2025.
+    """
+    from more_itertools import peekable
+
+    engine.register_events(*HyraxEvents)
+
+    @engine.on(Events.ITERATION_COMPLETED)
+    def maintain_event_handler(engine):
+        # Ensure we have a peekable iterator in the engine.
+        if not hasattr(engine._dataloader_iter, "peek"):
+            # Replace with a pass-through peekable iterator
+            engine._dataloader_iter = peekable(engine._dataloader_iter)
+
+        # After the final iteration the exhausted peekable iterator evaluates as falsy
+        if not engine._dataloader_iter:
+            engine.fire_event(HyraxEvents.HYRAX_EPOCH_COMPLETED)

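The ``fixup_engine`` workaround above hinges on one behavior of ``more_itertools.peekable``: the wrapper is falsy once the underlying iterator is exhausted, so the handler can detect the final iteration before ignite resets its state. A tiny stand-in ``Peekable`` (illustration only, not the real ``more_itertools`` implementation) demonstrates the last-iteration detection trick:

```python
# Minimal sketch of the peekable truthiness trick used by fixup_engine.

_SENTINEL = object()


class Peekable:
    """Pass-through iterator wrapper that can report whether items remain."""

    def __init__(self, iterable):
        self._it = iter(iterable)
        self._cache = _SENTINEL  # holds a peeked-at item, if any

    def __iter__(self):
        return self

    def __next__(self):
        # Hand back the cached (peeked) item first, then the underlying iterator.
        if self._cache is not _SENTINEL:
            value, self._cache = self._cache, _SENTINEL
            return value
        return next(self._it)

    def __bool__(self):
        # Peek ahead: falsy only when nothing is left to yield.
        if self._cache is _SENTINEL:
            try:
                self._cache = next(self._it)
            except StopIteration:
                return False
        return True


batches = Peekable(["batch0", "batch1"])
last_seen = []
for batch in batches:
    if not batches:              # true only after consuming the final batch
        last_seen.append(batch)  # this is where HYRAX_EPOCH_COMPLETED would fire

print(last_seen)  # ['batch1']
```

Because the truthiness check peeks rather than consumes, wrapping the engine's dataloader iterator this way is transparent to the training loop itself.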