Adding tests that use the PytorchBigWigDataset object. Also adding custom_position_sampler and custom_track_sampler options to the dataset objects.

jorenretel · jorenretel · commit 6bf417f2446f · 2025-04-29T22:40:34.000+02:00
diff --git a/bigwig_loader/dataset.py b/bigwig_loader/dataset.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Any
 from typing import Callable
+from typing import Iterable
 from typing import Iterator
 from typing import Literal
 from typing import Optional
@@ -78,6 +79,12 @@ class BigWigDataset:
             GPU. More threads means that more IO can take place while the GPU is busy doing
             calculations (decompressing or neural network training for example). More threads
             also means a higher GPU memory usage. Default: 4
+        custom_position_sampler: if set, this sampler will be used instead of the default
+            position sampler (which samples uniformly at random from the regions of interest).
+            This should be an iterable of tuples (chromosome, center).
+        custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
+            specified, each batch simply contains all tracks, or a randomly selected subset of
+            tracks in case sub_sample_tracks is set. Should be an iterable of lists of track indices.
         return_batch_objects: if True, the batches will be returned as instances of
             bigwig_loader.batch.Batch
     """
@@ -107,6 +114,8 @@ def __init__(
         repeat_same_positions: bool = False,
         sub_sample_tracks: Optional[int] = None,
         n_threads: int = 4,
+        custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
+        custom_track_sampler: Optional[Iterable[list[int]]] = None,
         return_batch_objects: bool = False,
     ):
         super().__init__()
@@ -152,32 +161,34 @@ def __init__(
         self._sub_sample_tracks = sub_sample_tracks
         self._n_threads = n_threads
         self._return_batch_objects = return_batch_objects
-
-    def _create_dataloader(self) -> StreamedDataloader:
-        position_sampler = RandomPositionSampler(
+        self._position_sampler = custom_position_sampler or RandomPositionSampler(
             regions_of_interest=self.regions_of_interest,
             buffer_size=self._position_sampler_buffer_size,
             repeat_same=self._repeat_same_positions,
         )
+        if custom_track_sampler is not None:
+            self._track_sampler: Optional[Iterable[list[int]]] = custom_track_sampler
+        elif sub_sample_tracks is not None:
+            self._track_sampler = TrackSampler(
+                total_number_of_tracks=len(self.bigwig_collection),
+                sample_size=sub_sample_tracks,
+            )
+        else:
+            self._track_sampler = None
 
+    def _create_dataloader(self) -> StreamedDataloader:
         sequence_sampler = GenomicSequenceSampler(
             reference_genome_path=self.reference_genome_path,
             sequence_length=self.sequence_length,
-            position_sampler=position_sampler,
+            position_sampler=self._position_sampler,
             maximum_unknown_bases_fraction=self.maximum_unknown_bases_fraction,
         )
-        track_sampler = None
-        if self._sub_sample_tracks is not None:
-            track_sampler = TrackSampler(
-                total_number_of_tracks=len(self.bigwig_collection),
-                sample_size=self._sub_sample_tracks,
-            )
 
         query_batch_generator = QueryBatchGenerator(
             genomic_location_sampler=sequence_sampler,
             center_bin_to_predict=self.center_bin_to_predict,
             batch_size=self.super_batch_size,
-            track_sampler=track_sampler,
+            track_sampler=self._track_sampler,
         )
 
         return StreamedDataloader(
diff --git a/bigwig_loader/pytorch.py b/bigwig_loader/pytorch.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Any
 from typing import Callable
+from typing import Iterable
 from typing import Iterator
 from typing import Literal
 from typing import Optional
@@ -165,6 +166,12 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
             also means a higher GPU memory usage. Default: 4
         return_batch_objects: if True, the batches will be returned as instances of
             bigwig_loader.pytorch.PytorchBatch
+        custom_position_sampler: if set, this sampler will be used instead of the default
+            position sampler (which samples uniformly at random from the regions of interest).
+            This should be an iterable of tuples (chromosome, center).
+        custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
+            specified, each batch simply contains all tracks, or a randomly selected subset of
+            tracks in case sub_sample_tracks is set. Should be an iterable of lists of track indices.
     """
 
     def __init__(
@@ -192,6 +199,8 @@ def __init__(
         repeat_same_positions: bool = False,
         sub_sample_tracks: Optional[int] = None,
         n_threads: int = 4,
+        custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
+        custom_track_sampler: Optional[Iterable[list[int]]] = None,
         return_batch_objects: bool = False,
     ):
         super().__init__()
@@ -217,6 +226,8 @@ def __init__(
             repeat_same_positions=repeat_same_positions,
             sub_sample_tracks=sub_sample_tracks,
             n_threads=n_threads,
+            custom_position_sampler=custom_position_sampler,
+            custom_track_sampler=custom_track_sampler,
             return_batch_objects=True,
         )
         self._return_batch_objects = return_batch_objects
diff --git a/bigwig_loader/sampler/genome_sampler.py b/bigwig_loader/sampler/genome_sampler.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Any
 from typing import Callable
+from typing import Iterable
 from typing import Iterator
 from typing import Literal
 from typing import Optional
@@ -21,7 +22,7 @@ def __init__(
         self,
         reference_genome_path: Path,
         sequence_length: int,
-        position_sampler: Iterator[tuple[str, int]],
+        position_sampler: Iterable[tuple[str, int]],
         maximum_unknown_bases_fraction: float = 0.1,
     ):
         self.reference_genome_path = reference_genome_path
diff --git a/bigwig_loader/sampler/position_sampler.py b/bigwig_loader/sampler/position_sampler.py
@@ -3,7 +3,13 @@
 import numpy as np
 import pandas as pd
 
-from bigwig_loader.util import make_cumulative_index_intervals
+
+def make_cumulative_index_intervals(intervals: pd.DataFrame) -> pd.DataFrame:
+    intervals.reset_index(drop=True, inplace=True)
+    intervals.index = (
+        (intervals["end"] - intervals["start"]).cumsum().shift().fillna(0).astype(int)  # type: ignore
+    )
+    return intervals
 
 
 class RandomPositionSampler:
@@ -22,6 +28,8 @@ def __init__(
         self._repeat_same = repeat_same
 
     def __iter__(self) -> Iterator[tuple[str, int]]:
+        if self._repeat_same:
+            self._index = 0
         return self
 
     def __next__(self) -> tuple[str, int]:
@@ -36,6 +44,7 @@ def __next__(self) -> tuple[str, int]:
         return chromosome, center
 
     def _refresh_buffer(self) -> None:
+        print("refresh buffer called")
         batch_rand = np.random.randint(
             low=0, high=self._max_index, size=self.buffer_size
         )
diff --git a/bigwig_loader/util.py b/bigwig_loader/util.py
@@ -28,14 +28,6 @@ def sort_intervals(intervals: pd.DataFrame, inplace: bool = False) -> pd.DataFra
         )
 
 
-def make_cumulative_index_intervals(intervals: pd.DataFrame) -> pd.DataFrame:
-    intervals.reset_index(drop=True, inplace=True)
-    intervals.index = (
-        (intervals["end"] - intervals["start"]).cumsum().shift().fillna(0).astype(int)  # type: ignore
-    )
-    return intervals
-
-
 _string_to_encoding = {
     "A": [1.0, 0.0, 0.0, 0.0],
     "C": [0.0, 1.0, 0.0, 0.0],
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,11 +5,11 @@
 import pytest
 
 from bigwig_loader import config
+from bigwig_loader.download_example_data import get_example_bigwigs_files
+from bigwig_loader.download_example_data import get_reference_genome
 
 try:
     from bigwig_loader.collection import BigWigCollection
-    from bigwig_loader.download_example_data import get_example_bigwigs_files
-    from bigwig_loader.download_example_data import get_reference_genome
 except ImportError:
     logging.warning(
         "Can not import from bigwig_loader.collection without cupy installed"
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -73,3 +73,38 @@ def test_batch_return_type(bigwig_path, reference_genome_path, merged_intervals)
     for i, batch in enumerate(dataset):
         assert isinstance(batch, Batch)
         assert batch.track_indices is not None
+
+
+def test_positions_are_reproducible(
+    bigwig_path, reference_genome_path, merged_intervals
+):
+    batch_size = 16
+
+    dataset = BigWigDataset(
+        regions_of_interest=merged_intervals,
+        collection=bigwig_path,
+        reference_genome_path=reference_genome_path,
+        sequence_length=2000,
+        center_bin_to_predict=1000,
+        window_size=4,
+        batch_size=batch_size,
+        batches_per_epoch=10,
+        maximum_unknown_bases_fraction=0.1,
+        first_n_files=2,
+        repeat_same_positions=True,
+        n_threads=1,
+        return_batch_objects=True,
+    )
+
+    starts_a = [
+        position
+        for batch in dataset
+        for position in zip(batch.chromosomes, batch.starts)
+    ]
+    starts_b = [
+        position
+        for batch in dataset
+        for position in zip(batch.chromosomes, batch.starts)
+    ]
+
+    assert starts_a == starts_b
diff --git a/tests/test_position_sampler.py b/tests/test_position_sampler.py
@@ -0,0 +1,39 @@
+from bigwig_loader.sampler.position_sampler import RandomPositionSampler
+
+
+def test_repeat_same_positions(merged_intervals):
+    sampler = RandomPositionSampler(
+        regions_of_interest=merged_intervals, repeat_same=True
+    )
+
+    first_samples = []
+    for i, sample in enumerate(sampler):
+        first_samples.append(sample)
+        if i == 5:
+            break
+    second_samples = []
+    for i, sample in enumerate(sampler):
+        second_samples.append(sample)
+        if i == 5:
+            break
+
+    assert first_samples == second_samples
+
+
+def test_not_repeat_same_positions(merged_intervals):
+    sampler = RandomPositionSampler(
+        regions_of_interest=merged_intervals, repeat_same=False
+    )
+
+    first_samples = []
+    for i, sample in enumerate(sampler):
+        first_samples.append(sample)
+        if i == 5:
+            break
+    second_samples = []
+    for i, sample in enumerate(sampler):
+        second_samples.append(sample)
+        if i == 5:
+            break
+
+    assert first_samples != second_samples
diff --git a/tests/test_pytorch_dataset.py b/tests/test_pytorch_dataset.py
@@ -1,5 +1,10 @@
+from math import isnan
+
+import pandas as pd
 import pytest
 
+from bigwig_loader import config
+
 torch = pytest.importorskip("torch")
 
 
@@ -30,3 +35,81 @@ def test_input_and_target_is_torch_tensor(pytorch_dataset):
     sequence, target = next(iter(pytorch_dataset))
     assert isinstance(sequence, torch.Tensor)
     assert isinstance(target, torch.Tensor)
+
+
+@pytest.mark.parametrize("default_value", [0.0, torch.nan, 4.0, 5.6])
+def test_pytorch_dataset_with_window_function(
+    default_value, bigwig_path, reference_genome_path, merged_intervals
+):
+    from bigwig_loader.pytorch import PytorchBigWigDataset
+
+    center_bin_to_predict = 2048
+    window_size = 128
+    reduced_dim = center_bin_to_predict // window_size
+
+    batch_size = 16
+
+    df = pd.read_csv(config.example_positions, sep="\t")
+    df = df[df["chr"].isin({"chr1", "chr3", "chr5"})]
+    chromosomes = list(df["chr"])[:batch_size]
+    centers = list(df["center"])[:batch_size]
+
+    position_sampler = [(chrom, center) for chrom, center in zip(chromosomes, centers)]
+
+    dataset = PytorchBigWigDataset(
+        regions_of_interest=merged_intervals,
+        collection=bigwig_path,
+        reference_genome_path=reference_genome_path,
+        sequence_length=center_bin_to_predict * 2,
+        center_bin_to_predict=center_bin_to_predict,
+        window_size=1,
+        batch_size=batch_size,
+        batches_per_epoch=1,
+        maximum_unknown_bases_fraction=0.1,
+        first_n_files=3,
+        custom_position_sampler=position_sampler,
+        default_value=default_value,
+        return_batch_objects=True,
+    )
+
+    dataset_with_window = PytorchBigWigDataset(
+        regions_of_interest=merged_intervals,
+        collection=bigwig_path,
+        reference_genome_path=reference_genome_path,
+        sequence_length=center_bin_to_predict * 2,
+        center_bin_to_predict=center_bin_to_predict,
+        window_size=window_size,
+        batch_size=batch_size,
+        batches_per_epoch=1,
+        maximum_unknown_bases_fraction=0.1,
+        first_n_files=3,
+        custom_position_sampler=position_sampler,
+        default_value=default_value,
+        return_batch_objects=True,
+    )
+
+    print(dataset_with_window._dataset.bigwig_collection.bigwig_paths)
+
+    for batch, batch_with_window in zip(dataset, dataset_with_window):
+        print(batch)
+        print(batch_with_window)
+        print(batch.chromosomes)
+        print(batch_with_window.chromosomes)
+        print(batch.starts)
+        print(batch_with_window.starts)
+        print(batch.ends)
+        print(batch_with_window.ends)
+        expected = batch.values.reshape(
+            batch.values.shape[0], batch.values.shape[1], reduced_dim, window_size
+        )
+        if not isnan(default_value) or default_value == 0:
+            expected = torch.nan_to_num(expected, nan=default_value)
+        expected = torch.nanmean(expected, axis=-1)
+        print("---")
+        print("expected")
+        print(expected)
+        print("batch_with_window")
+        print(batch_with_window.values)
+        assert torch.allclose(expected, batch_with_window.values, equal_nan=True)
+        if isnan(default_value):
+            assert torch.isnan(batch_with_window.values).any()
diff --git a/tests/test_window_function.py b/tests/test_window_function.py