Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ Stereo Matching
CarlaStereo
Kitti2012Stereo
Kitti2015Stereo
FallingThingsStereo
SceneFlowStereo
SintelStereo
InStereo2k
Expand Down
63 changes: 63 additions & 0 deletions test/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2841,6 +2841,69 @@ def test_train_splits(self):
datasets_utils.shape_test_for_stereo(left, right, disparity)


class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase):
    """Fake-data test case for ``datasets.FallingThingsStereo``."""

    DATASET_CLASS = datasets.FallingThingsStereo
    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(variant=("single", "mixed", "both"))
    FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))

    @staticmethod
    def _make_dummy_depth_map(root: str, name: str, size: Tuple[int, int]):
        # A constant uint8 (H, W) image is sufficient for shape testing.
        target = pathlib.Path(root) / name
        depth = np.ones((size[0], size[1]), dtype=np.uint8)
        PIL.Image.fromarray(depth).save(target)

    @staticmethod
    def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> None:
        scene_dir = pathlib.Path(root) / scene_name
        os.makedirs(scene_dir, exist_ok=True)
        for side in ("left", "right"):
            # jpg color image for this side
            datasets_utils.create_image_file(scene_dir, f"image1.{side}.jpg", size=(3, size[1], size[0]))
            # matching single channel depth map
            FallingThingsStereoTestCase._make_dummy_depth_map(
                scene_dir, f"image1.{side}.depth.png", size=(size[0], size[1])
            )
        # camera settings json. Minimal example for _read_disparity function testing
        with open(scene_dir / "_camera_settings.json", "w") as f:
            json.dump({"camera_settings": [{"intrinsic_settings": {"fx": 1}}]}, f)

    def inject_fake_data(self, tmpdir, config):
        dataset_root = pathlib.Path(tmpdir) / "FallingThings"
        os.makedirs(dataset_root, exist_ok=True)

        requested = config["variant"]
        # Unknown variants fall through to zero examples / no folders so that
        # test_bad_input can still reach the dataset's own validation error.
        num_examples = {"single": 2, "mixed": 3, "both": 4}.get(requested, 0)
        folder_names = {
            "single": ["single"],
            "mixed": ["mixed"],
            "both": ["single", "mixed"],
        }.get(requested, [])

        for folder_name in folder_names:
            variant_dir = dataset_root / folder_name
            os.makedirs(variant_dir, exist_ok=True)
            for idx in range(num_examples):
                self._make_scene_folder(
                    root=variant_dir,
                    scene_name=f"scene_{idx:06d}",
                    size=(100, 200),
                )

        # "both" populates two variant folders, doubling the total example count.
        return num_examples * 2 if requested == "both" else num_examples

    def test_splits(self):
        for variant in ("single", "mixed"):
            with self.create_dataset(variant=variant) as (dataset, _):
                for left, right, disparity in dataset:
                    datasets_utils.shape_test_for_stereo(left, right, disparity)

    def test_bad_input(self):
        with pytest.raises(ValueError, match="Unknown value 'bad' for argument variant"):
            with self.create_dataset(variant="bad"):
                pass


class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase):
DATASET_CLASS = datasets.SceneFlowStereo
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
Expand Down
11 changes: 10 additions & 1 deletion torchvision/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
from ._optical_flow import FlyingChairs, FlyingThings3D, HD1K, KittiFlow, Sintel
from ._stereo_matching import CarlaStereo, InStereo2k, Kitti2012Stereo, Kitti2015Stereo, SceneFlowStereo, SintelStereo
from ._stereo_matching import (
CarlaStereo,
FallingThingsStereo,
InStereo2k,
Kitti2012Stereo,
Kitti2015Stereo,
SceneFlowStereo,
SintelStereo,
)
from .caltech import Caltech101, Caltech256
from .celeba import CelebA
from .cifar import CIFAR10, CIFAR100
Expand Down Expand Up @@ -109,6 +117,7 @@
"Kitti2012Stereo",
"Kitti2015Stereo",
"CarlaStereo",
"FallingThingsStereo",
"SceneFlowStereo",
"SintelStereo",
"InStereo2k",
Expand Down
99 changes: 99 additions & 0 deletions torchvision/datasets/_stereo_matching.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import functools
import json
import os
from abc import ABC, abstractmethod
from glob import glob
Expand Down Expand Up @@ -362,6 +363,104 @@ def __getitem__(self, index: int) -> Tuple:
return super().__getitem__(index)


class FallingThingsStereo(StereoMatchingDataset):
    """`FallingThings <https://research.nvidia.com/publication/2018-06_falling-things-synthetic-dataset-3d-object-detection-and-pose-estimation>`_ dataset.

    The dataset is expected to have the following structure: ::

        root
            FallingThings
                single
                    scene1
                        _object_settings.json
                        _camera_settings.json
                        image1.left.depth.png
                        image1.right.depth.png
                        image1.left.jpg
                        image1.right.jpg
                        image2.left.depth.png
                        image2.right.depth.png
                        image2.left.jpg
                        image2.right.jpg
                        ...
                    scene2
                    ...
                mixed
                    scene1
                        _object_settings.json
                        _camera_settings.json
                        image1.left.depth.png
                        image1.right.depth.png
                        image1.left.jpg
                        image1.right.jpg
                        image2.left.depth.png
                        image2.right.depth.png
                        image2.left.jpg
                        image2.right.jpg
                        ...
                    scene2
                    ...

    Args:
        root (string): Root directory where FallingThings is located.
        variant (string): Which variant to use. Either "single", "mixed", or "both".
        transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version.
    """

    def __init__(self, root: str, variant: str = "single", transforms: Optional[Callable] = None):
        super().__init__(root, transforms)

        root = Path(root) / "FallingThings"

        verify_str_arg(variant, "variant", valid_values=("single", "mixed", "both"))

        # "both" simply concatenates the samples of the two scene-type folders.
        variants = {
            "single": ["single"],
            "mixed": ["mixed"],
            "both": ["single", "mixed"],
        }[variant]

        for s in variants:
            left_img_pattern = str(root / s / "*" / "*.left.jpg")
            right_img_pattern = str(root / s / "*" / "*.right.jpg")
            self._images += self._scan_pairs(left_img_pattern, right_img_pattern)

            left_disparity_pattern = str(root / s / "*" / "*.left.depth.png")
            right_disparity_pattern = str(root / s / "*" / "*.right.depth.png")
            self._disparities += self._scan_pairs(left_disparity_pattern, right_disparity_pattern)

    def _read_disparity(self, file_path: str) -> Tuple:
        """Convert a depth map file into a ``(1, H, W)`` float32 disparity map.

        The per-scene ``_camera_settings.json`` provides the focal length used
        in the depth-to-disparity conversion.
        """
        # (H, W) image
        depth = np.asarray(Image.open(file_path))
        # as per https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt
        # in order to extract disparity from depth maps
        camera_settings_path = Path(file_path).parent / "_camera_settings.json"
        with open(camera_settings_path, "r") as f:
            # inverse of depth-from-disparity equation: depth = (baseline * focal) / (disparity * pixel_constant)
            intrinsics = json.load(f)
            focal = intrinsics["camera_settings"][0]["intrinsic_settings"]["fx"]
            baseline, pixel_constant = 6, 100  # pixel constant is inverted
            disparity_map = (baseline * focal * pixel_constant) / depth.astype(np.float32)
            # unsqueeze disparity to (C, H, W)
            disparity_map = disparity_map[None, :, :]
            # no occlusion / validity information is provided by this dataset
            valid_mask = None
            return disparity_map, valid_mask

    def __getitem__(self, index: int) -> Tuple:
        """Return example at given index.

        Args:
            index(int): The index of the example to retrieve

        Returns:
            tuple: A 3-tuple with ``(img_left, img_right, disparity)``.
            The disparity is a numpy array of shape (1, H, W) and the images are PIL images.
            If a ``valid_mask`` is generated within the ``transforms`` parameter,
            a 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` is returned.
        """
        return super().__getitem__(index)


class SceneFlowStereo(StereoMatchingDataset):
"""Dataset interface for `Scene Flow <https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html>`_ datasets.
This interface provides access to the `FlyingThings3D, `Monkaa` and `Driving` datasets.
Expand Down