[WIP] UCF101 prototype with utilities for video loading #4838
@@ -0,0 +1,32 @@

```python
from torchvision.prototype import datasets
from torchvision.prototype.datasets.video_utils import AVKeyframeReader, AVRandomFrameReader, AVClipReader


print("\n \n KEYFRAMES \n \n")
ct = 0
dataset = AVKeyframeReader(datasets.load("ucf101"))
for i in dataset:
    print(i)
    ct += 1
    if ct > 5:
        break


print("\n \n RANDOM FRAMES")
ct = 0
dataset = AVRandomFrameReader(datasets.load("ucf101"), num_samples=3)
for i in dataset:
    print(i)
    ct += 1
    if ct > 5:
        break


print("\n \n CLIPS ")
ct = 0
dataset = AVClipReader(datasets.load("ucf101"), num_frames_per_clip=16, num_clips_per_video=8)
for i in dataset:
    print(i["path"], i["range"])
    ct += 1
    if ct > 5:
        break
```
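For reference, the dictionaries printed above have the following shape. This is a sketch inferred from the reader implementations further down in this PR; the field types are assumptions, not checked output:

```python
# Sample yielded by AVKeyframeReader / AVRandomFrameReader (sketch):
# {
#     "frame": PIL.Image.Image,   # decoded frame
#     "pts": int,                 # presentation timestamp of the frame
#     "video_meta": {"time_base": float, "guessed_fps": float},
#     "path": str,                # video path inside the UCF101 archive
#     "target": str,              # label parsed from the split annotation file
# }
#
# Sample yielded by AVClipReader (sketch):
# {
#     "clip": torch.Tensor,       # uint8 tensor of shape [num_frames_per_clip, C, H, W]
#     "pts": torch.Tensor,        # timestamps of the clip's frames
#     "range": (float, float),    # clip start/end in seconds
#     "video_meta": {"time_base": float, "guessed_fps": float},
#     "path": str,
#     "target": str,
# }
```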
@@ -0,0 +1,89 @@

```python
import io
import pathlib
from typing import Any, Callable, Dict, List, Optional, Tuple

from torchvision.prototype.datasets.utils._internal import RarArchiveReader, INFINITE_BUFFER_SIZE

import torch
from torchdata.datapipes.iter import CSVParser, KeyZipper
from torch.utils.data import IterDataPipe
from torch.utils.data.datapipes.iter import (
    Filter,
    Mapper,
    ZipArchiveReader,
    Shuffler,
)
from torchvision.prototype.datasets.utils._internal import path_accessor, path_comparator
from torchvision.prototype.datasets.utils import (
    Dataset,
    DatasetConfig,
    DatasetInfo,
    HttpResource,
    OnlineResource,
    DatasetType,
)


class ucf101(Dataset):
    """This is a base datapipe that returns a file handle of the video.
    What we want to do is implement either several decoder options or additional
    datapipe extensions to make this work.
    """

    def _make_info(self) -> DatasetInfo:
        return DatasetInfo(
            "ucf101",
            type=DatasetType.VIDEO,
            valid_options={"split": ["train", "test"], "fold": ["1", "2", "3"]},
            homepage="https://www.crcv.ucf.edu/data/UCF101.php",
        )

    def resources(self, config: DatasetConfig) -> List[OnlineResource]:
        return [
            HttpResource(
                "https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip",
                sha256="5c0d1a53b8ed364a2ac830a73f405e51bece7d98ce1254fd19ed4a36b224bd27",
            ),
            HttpResource(
                "https://www.crcv.ucf.edu/data/UCF101/UCF101.rar",
                sha256="ca8dfadb4c891cb11316f94d52b6b0ac2a11994e67a0cae227180cd160bd8e55",
            ),
        ]

    def _generate_categories(self, root: pathlib.Path) -> List[str]:
        dp = self.resources(self.default_config)[1].to_datapipe(pathlib.Path(root) / self.name)
        dp = RarArchiveReader(dp)
        dir_names = {pathlib.Path(path).parent.name for path, _ in dp}
        return [name.split(".")[1] for name in sorted(dir_names)]

    def _collate_and_decode(
        self,
        data: Tuple[Tuple[str, int], Tuple[str, io.IOBase]],
        *,
        decoder: Optional[Callable[[io.IOBase], Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        annotations_d, file_d = data
        label = annotations_d[1]
        _path, file_handle = file_d
        file = decoder(file_handle) if decoder else file_handle
        return {"path": _path, "file": file, "target": label}

    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
    ) -> IterDataPipe[Dict[str, Any]]:
        annotations = resource_dps[0]
        files = resource_dps[1]

        annotations_dp = ZipArchiveReader(annotations)
        annotations_dp = Filter(annotations_dp, path_comparator("name", f"{config.split}list0{config.fold}.txt"))
        annotations_dp = CSVParser(annotations_dp, delimiter=" ")
        # COMMENT OUT FOR TESTING
```
> **Review:** True, but should be removed before merge.

> **Review:** I would really want our datasets to be deterministic outside of a DataLoader though. Making it stochastic will make it much harder to debug. Maybe what we should do instead is have a new […] Thoughts?
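One way to read that suggestion, sketched here rather than taken from the PR: keep the datapipe deterministic and let callers opt into shuffling themselves. The snippet below is hypothetical; it assumes the unconditional `Shuffler` line that follows is dropped:

```python
from torch.utils.data.datapipes.iter import Shuffler
from torchvision.prototype import datasets

# Hypothetical opt-in shuffling applied by the user on top of a
# deterministic datapipe, instead of inside _make_datapipe.
dp = datasets.load("ucf101", split="train", fold="1")
dp = Shuffler(dp, buffer_size=10_000)
```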
```python
        annotations_dp = Shuffler(annotations_dp, buffer_size=INFINITE_BUFFER_SIZE)

        files_dp = RarArchiveReader(files)
        dp = KeyZipper(annotations_dp, files_dp, path_accessor("name"))
        return Mapper(dp, self._collate_and_decode, fn_kwargs=dict(decoder=decoder))
```
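For context, a minimal usage sketch of the datapipe above, assuming (as the demo script does) that `datasets.load` accepts the options declared in `valid_options`:

```python
from torchvision.prototype import datasets

# Hypothetical call: option names mirror
# valid_options={"split": ["train", "test"], "fold": ["1", "2", "3"]}.
dp = datasets.load("ucf101", split="train", fold="1")

for sample in dp:
    # _collate_and_decode returns {"path": ..., "file": ..., "target": ...}.
    print(sample["path"], sample["target"])
    break
```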
@@ -0,0 +1,171 @@

```python
from typing import Any, Dict, Iterator
import random

import av
import numpy as np
import torch
from torchdata.datapipes.iter import IterDataPipe
from torchvision.io import video, _video_opt
```

> **Review:** I'm not sure if I would use […]

> **Review:** Sure.
```python
class AVKeyframeReader(IterDataPipe[Dict[str, Any]]):
    def __init__(self, video_dp: IterDataPipe[Dict[str, Any]]) -> None:
        """TorchData IterDataPipe that takes in a video datapipe
        and yields all keyframes in a video.

        Args:
            video_dp (IterDataPipe[Dict[str, Any]]): Video dataset IterDataPipe
        """
        self.datapipe = video_dp

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        for video_d in self.datapipe:
            buffer = video_d.pop("file")
            with av.open(buffer, metadata_errors="ignore") as container:
                stream = container.streams.video[0]
                # decode keyframes only
                stream.codec_context.skip_frame = "NONKEY"
                for frame in container.decode(stream):
                    img = frame.to_image()
                    yield dict(
                        video_d,
                        frame=img,
                        pts=frame.pts,
                        video_meta={
                            "time_base": float(frame.time_base),
                            "guessed_fps": float(stream.guessed_rate),
                        },
                    )


class AVRandomFrameReader(IterDataPipe[Dict[str, Any]]):
    def __init__(self, video_dp: IterDataPipe[Dict[str, Any]], num_samples=1, transform=None) -> None:
        """TorchData IterDataPipe that takes in a video datapipe
        and yields `num_samples` random frames from a video.

        Args:
            video_dp (IterDataPipe[Dict[str, Any]]): Video dataset IterDataPipe
            num_samples (int, optional): Number of frames to sample from each video. Defaults to 1.
        """
        self.datapipe = video_dp
        self.num_samples = num_samples

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        for video_d in self.datapipe:
            buffer = video_d.pop("file")
            with av.open(buffer, metadata_errors="ignore") as container:
                stream = container.streams.video[0]
                # duration is given in time_base units as int
                duration = stream.duration
                # seek to a random frame
                seek_idxs = random.sample(list(range(duration)), self.num_samples)
                for i in seek_idxs:
                    container.seek(i, any_frame=True, stream=stream)
                    frame = next(container.decode(stream))
                    img = frame.to_image()

                    video_meta = {
                        "time_base": float(frame.time_base),
                        "guessed_fps": float(stream.guessed_rate),
                    }

                    yield dict(video_d, frame=img, pts=frame.pts, video_meta=video_meta)


class AVClipReader(IterDataPipe[Dict[str, Any]]):
    def __init__(
        self,
        video_dp: IterDataPipe[Dict[str, Any]],
        num_frames_per_clip=8,
        num_clips_per_video=1,
        step_between_clips=1,
    ) -> None:
        """TorchData IterDataPipe that takes in a video datapipe
        and yields `num_clips_per_video` video clips (sequences of `num_frames_per_clip` frames) from a video.
        Clips are sampled from all possible clips of length `num_frames_per_clip` spaced `step_between_clips` apart.

        Args:
            video_dp (IterDataPipe[Dict[str, Any]]): Video dataset IterDataPipe
            num_frames_per_clip (int, optional): Length of a video clip in frames. Defaults to 8.
            num_clips_per_video (int, optional): How many clips to sample from each video. Defaults to 1.
            step_between_clips (int, optional): Minimum spacing between two clips. Defaults to 1.
        """
        self.datapipe = video_dp
        self.num_frames_per_clip = num_frames_per_clip
        self.num_clips_per_video = num_clips_per_video
        self.step_between_clips = step_between_clips

    def _unfold(self, tensor, dilation=1):
        """Similar to tensor.unfold, but with dilation and specialized for 1d tensors.

        Returns all consecutive windows of `self.num_frames_per_clip` elements, with
        `self.step_between_clips` between window starts. The distance between each element
        within a window is given by `dilation`.
        """
        assert tensor.dim() == 1
        o_stride = tensor.stride(0)
        numel = tensor.numel()
        new_stride = (self.step_between_clips * o_stride, dilation * o_stride)
        new_size = (
            (numel - (dilation * (self.num_frames_per_clip - 1) + 1)) // self.step_between_clips + 1,
            self.num_frames_per_clip,
        )
        if new_size[0] < 1:
            new_size = (0, self.num_frames_per_clip)
        return torch.as_strided(tensor, new_size, new_stride)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        for video_d in self.datapipe:
            buffer = video_d["file"]
            with av.open(buffer, metadata_errors="ignore") as container:
                stream = container.streams.video[0]
                time_base = stream.time_base

                # duration is given in time_base units as int
                duration = stream.duration

                # get video_stream timestamps,
                # with a tolerance for pyav imprecision
                _ptss = torch.arange(duration - 7)
                _ptss = self._unfold(_ptss)
                # shuffle the clips
                perm = torch.randperm(_ptss.size(0))
                idx = perm[: self.num_clips_per_video]
                samples = _ptss[idx]

                for clip_pts in samples:
                    start_pts = clip_pts[0].item()
                    end_pts = clip_pts[-1].item()
                    # video_timebase is the default time_base
                    pts_unit = "pts"
                    start_pts, end_pts, pts_unit = _video_opt._convert_to_sec(start_pts, end_pts, "pts", time_base)
                    video_frames = video._read_from_stream(
                        container,
                        start_pts,
                        end_pts,
                        pts_unit,
                        stream,
                        {"video": 0},
                    )

                    vframes_list = [frame.to_ndarray(format="rgb24") for frame in video_frames]

                    if vframes_list:
                        vframes = torch.as_tensor(np.stack(vframes_list))
                        # account for rounding errors in conversion
                        # FIXME: fix this in the code
                        vframes = vframes[: self.num_frames_per_clip, ...]
                    else:
                        vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8)
                        print("FAIL")

                    # [N, H, W, C] to [N, C, H, W]
                    vframes = vframes.permute(0, 3, 1, 2)
                    assert vframes.size(0) == self.num_frames_per_clip

                    # TODO: support sampling rates (FPS change)
                    # TODO: optimization (read all and select)

                    yield {
                        "clip": vframes,
                        "pts": clip_pts,
                        "range": (start_pts, end_pts),
                        "video_meta": {
                            "time_base": float(stream.time_base),
                            "guessed_fps": float(stream.guessed_rate),
                        },
                        "path": video_d["path"],
                        "target": video_d["target"],
                    }
```
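As a sanity check on `_unfold`'s strided-window arithmetic, here is a standalone re-creation with small numbers; the parameter values are picked for illustration only:

```python
import torch

# Reproduce _unfold with num_frames_per_clip=4, step_between_clips=2, dilation=1.
num_frames_per_clip, step_between_clips, dilation = 4, 2, 1
pts = torch.arange(10)

o_stride = pts.stride(0)
new_stride = (step_between_clips * o_stride, dilation * o_stride)
new_size = (
    (pts.numel() - (dilation * (num_frames_per_clip - 1) + 1)) // step_between_clips + 1,
    num_frames_per_clip,
)
windows = torch.as_strided(pts, new_size, new_stride)
print(windows)
# tensor([[0, 1, 2, 3],
#         [2, 3, 4, 5],
#         [4, 5, 6, 7],
#         [6, 7, 8, 9]])
```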