Skip to content

[WIP] UCF101 prototype with utilities for video loading #4838

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 52 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
9c9b27e
stash
bjuncek Oct 21, 2021
e00c095
Merge branch 'pytorch:main' into bkorbar/prototypes/ucf101
bjuncek Oct 21, 2021
914380f
base implementation
Oct 31, 2021
9dd6786
Format and add documentation to the video utilities
Nov 2, 2021
7ad8357
simple driver for Philip to play with
Nov 2, 2021
dc205e9
format ucf101 and lint stuff
Nov 2, 2021
711adf3
Update torchvision/prototype/datasets/_builtin/ucf101.py
bjuncek Nov 3, 2021
56c1779
Update torchvision/prototype/datasets/_builtin/ucf101.py
bjuncek Nov 3, 2021
65f3c64
Update torchvision/prototype/datasets/video_utils.py
bjuncek Nov 3, 2021
017e9b9
Merge branch 'main' into bkorbar/prototypes/ucf101
bjuncek Nov 3, 2021
666ca6e
Merge branch 'main' into bkorbar/prototypes/ucf101
bjuncek Nov 3, 2021
acc0e54
address https://github.com/pytorch/vision/pull/4838#pullrequestreview…
Nov 10, 2021
c209153
Update torchvision/prototype/datasets/_builtin/ucf101.py
bjuncek Nov 10, 2021
31c0eb7
Merge branch 'bkorbar/prototypes/ucf101' of https://github.com/bjunce…
Nov 10, 2021
f5eb8fd
use internal utils
Nov 10, 2021
0a66ff0
remove transform antipattern
Nov 10, 2021
d29d22b
change return/pop stuff
Nov 10, 2021
cf4f354
remove unnecessary and uncalled methods
Nov 10, 2021
52b2b67
make changes to catch up with the master
Nov 12, 2021
5e2f15d
minor flake
Nov 12, 2021
b608f6d
lint
Nov 12, 2021
4f281c4
add video default decoder
Nov 12, 2021
ab6a2b8
revert changes to the decoder
bjuncek Nov 12, 2021
9800f8e
Apply suggestions from code review
bjuncek Nov 12, 2021
64b644f
apply suggestions from code review
Nov 12, 2021
7557931
Merge branch 'bkorbar/prototypes/ucf101' of https://github.com/bjunce…
Nov 12, 2021
18eb9c0
Merge branch 'main' into bkorbar/prototypes/ucf101
pmeier Nov 12, 2021
587723e
Update torchvision/prototype/datasets/_builtin/ucf101.py
bjuncek Nov 28, 2021
a3737ab
addressing comments 1
Nov 28, 2021
8fce5ff
remove shuffler comment
Nov 28, 2021
a10a3a0
remove main.py
Nov 28, 2021
697fdfd
clang and flake being mad at me
Nov 28, 2021
ebef4f2
Merge branch 'main' into bkorbar/prototypes/ucf101
bjuncek Nov 28, 2021
62078b6
Merge branch 'main' into bkorbar/prototypes/ucf101
bjuncek Dec 1, 2021
a574089
adding type annotations
Dec 1, 2021
d809cb9
pass flake8
Dec 1, 2021
8f57ee6
Decoder typing change
Dec 1, 2021
8f21f0e
remove unused parameters
Dec 1, 2021
8dbda84
fixing _api with decoder changes
Dec 1, 2021
788d82a
build errors
Dec 1, 2021
31a8929
remove unused
Dec 1, 2021
84cdecb
Merge branch 'main' into bkorbar/prototypes/ucf101
pmeier Dec 2, 2021
4386c48
fix python lint
pmeier Dec 2, 2021
97bd457
cleanup decoder
pmeier Dec 2, 2021
4609783
mypy fix
Dec 2, 2021
1c77e6f
Merge branch 'bkorbar/prototypes/ucf101' of https://github.com/bjunce…
Dec 2, 2021
6019ce7
[DIRTY] Merge branch 'main' into bkorbar/prototypes/ucf101
pmeier Dec 16, 2021
25c3668
revert decoder changes
pmeier Dec 16, 2021
08a616c
add categories and fix data loading
pmeier Dec 16, 2021
0675649
cleanup
pmeier Dec 16, 2021
381f70e
Merge branch 'main' into bkorbar/prototypes/ucf101
pmeier Dec 19, 2021
f1a69e0
use shuffling hint
pmeier Dec 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from torchvision.prototype import datasets
from torchvision.prototype.datasets.video_utils import AVKeyframeReader, AVRandomFrameReader, AVClipReader

# Demo driver: print a handful of samples from each of the three video-reading
# datapipes so their output shapes/contents can be eyeballed.

NUM_PREVIEW_SAMPLES = 6


def _preview(datapipe, header, keys=None):
    """Print up to NUM_PREVIEW_SAMPLES samples from *datapipe*.

    Args:
        datapipe: iterable of sample dicts.
        header: banner string printed before the samples.
        keys: optional list of sample-dict keys to print instead of the
            whole sample (used for the bulky clip tensors).
    """
    print(header)
    for count, sample in enumerate(datapipe):
        if keys is None:
            print(sample)
        else:
            print(*(sample[key] for key in keys))
        # stop after NUM_PREVIEW_SAMPLES samples, mirroring the original
        # "ct > 5" break
        if count >= NUM_PREVIEW_SAMPLES - 1:
            break


_preview(AVKeyframeReader(datasets.load("ucf101")), "\n \n KEYFRAMES \n \n")
_preview(AVRandomFrameReader(datasets.load("ucf101"), num_samples=3), "\n \n RANDOM FRAMES")
_preview(
    AVClipReader(datasets.load("ucf101"), num_frames_per_clip=16, num_clips_per_video=8),
    "\n \n CLIPS ",
    keys=["path", "range"],
)
1 change: 1 addition & 0 deletions torchvision/prototype/datasets/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def info(name: str) -> DatasetInfo:
# Fallback decoder used when the caller does not pass one, keyed by dataset type.
# NOTE(review): `None` is not a `Callable`, so the VIDEO entry violates the
# declared value type; the annotation should presumably be
# `Dict[DatasetType, Optional[Callable[...]]]` — confirm the intent is
# "no default decoder for videos" and fix the annotation accordingly.
DEFAULT_DECODER: Dict[DatasetType, Callable[[io.IOBase], torch.Tensor]] = {
    DatasetType.RAW: raw,
    DatasetType.IMAGE: pil,
    DatasetType.VIDEO: None,
}


Expand Down
1 change: 1 addition & 0 deletions torchvision/prototype/datasets/_builtin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .caltech import Caltech101, Caltech256
from .celeba import CelebA
from .cifar import Cifar10, Cifar100
from .ucf101 import ucf101
from .coco import Coco
from .imagenet import ImageNet
from .mnist import MNIST, FashionMNIST, KMNIST, EMNIST, QMNIST
Expand Down
89 changes: 89 additions & 0 deletions torchvision/prototype/datasets/_builtin/ucf101.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import io
import pathlib
from typing import Any, Callable, Dict, List, Optional, Tuple

from torchvision.prototype.datasets.utils._internal import RarArchiveReader, INFINITE_BUFFER_SIZE

import torch
from torchdata.datapipes.iter import CSVParser, KeyZipper
from torch.utils.data import IterDataPipe
from torch.utils.data.datapipes.iter import (
Filter,
Mapper,
ZipArchiveReader,
Shuffler,
)
from torchvision.prototype.datasets.utils._internal import path_accessor, path_comparator
from torchvision.prototype.datasets.utils import (
Dataset,
DatasetConfig,
DatasetInfo,
HttpResource,
OnlineResource,
DatasetType,
)


class ucf101(Dataset):
    """UCF101 action-recognition video dataset.

    This is a base datapipe that returns a file handle of each video together
    with its label. Decoding is left to the optional ``decoder`` argument or to
    downstream datapipe extensions (see
    ``torchvision.prototype.datasets.video_utils``).
    """

    def _make_info(self) -> DatasetInfo:
        """Describe the dataset: type, configuration options, and homepage."""
        return DatasetInfo(
            "ucf101",
            type=DatasetType.VIDEO,
            # "fold" selects one of the three official train/test splits that
            # ship with the annotations archive.
            valid_options={"split": ["train", "test"], "fold": ["1", "2", "3"]},
            homepage="https://www.crcv.ucf.edu/data/UCF101.php",
        )

    def resources(self, config: DatasetConfig) -> List[OnlineResource]:
        """Return the two archives needed by every config: the train/test
        split annotations (zip) and the videos themselves (rar)."""
        return [
            HttpResource(
                "https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip",
                sha256="5c0d1a53b8ed364a2ac830a73f405e51bece7d98ce1254fd19ed4a36b224bd27",
            ),
            HttpResource(
                "https://www.crcv.ucf.edu/data/UCF101/UCF101.rar",
                sha256="ca8dfadb4c891cb11316f94d52b6b0ac2a11994e67a0cae227180cd160bd8e55",
            ),
        ]

    def _generate_categories(self, root: pathlib.Path) -> List[str]:
        """Derive the sorted list of category names from the video archive."""
        dp = self.resources(self.default_config)[1].to_datapipe(pathlib.Path(root) / self.name)
        dp = RarArchiveReader(dp)
        dir_names = {pathlib.Path(path).parent.name for path, _ in dp}
        # assumes directory names inside the archive have the form
        # "<prefix>.<category>" — TODO(review): confirm against the actual
        # UCF101.rar layout; a name without a "." would raise IndexError here.
        return [name.split(".")[1] for name in sorted(dir_names)]

    def _collate_and_decode(
        self,
        data: Tuple[Tuple[str, int], Tuple[str, io.IOBase]],
        *,
        decoder: Optional[Callable[[io.IOBase], Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """Merge a (annotation, file) pair into a single sample dict.

        Args:
            data: pair of (path, label) from the annotations CSV and
                (path, file handle) from the video archive.
            decoder: optional callable applied to the file handle; when absent
                the raw handle is returned under the "file" key.
        """
        annotations_d, file_d = data
        label = annotations_d[1]
        _path, file_handle = file_d
        file = decoder(file_handle) if decoder else file_handle
        return {"path": _path, "file": file, "target": label}

    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
    ) -> IterDataPipe[Dict[str, Any]]:
        """Build the sample pipeline: pick the split/fold annotation file,
        shuffle it, and zip each annotation with its video file handle."""

        annotations = resource_dps[0]
        files = resource_dps[1]

        # the zip archive holds one "<split>list0<fold>.txt" file per config
        annotations_dp = ZipArchiveReader(annotations)
        annotations_dp = Filter(annotations_dp, path_comparator("name", f"{config.split}list0{config.fold}.txt"))
        annotations_dp = CSVParser(annotations_dp, delimiter=" ")
        annotations_dp = Shuffler(annotations_dp, buffer_size=INFINITE_BUFFER_SIZE)

        # match every annotation row to its video by file name
        files_dp = RarArchiveReader(files)
        dp = KeyZipper(annotations_dp, files_dp, path_accessor("name"))
        return Mapper(dp, self._collate_and_decode, fn_kwargs=dict(decoder=decoder))
5 changes: 3 additions & 2 deletions torchvision/prototype/datasets/utils/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
class DatasetType(enum.Enum):
    """Kind of payload a dataset yields; used e.g. to pick a default decoder."""

    RAW = enum.auto()
    IMAGE = enum.auto()
    VIDEO = enum.auto()


class DatasetConfig(FrozenBunch):
Expand Down Expand Up @@ -151,7 +152,7 @@ def _make_datapipe(
resource_dps: List[IterDataPipe],
*,
config: DatasetConfig,
decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
decoder: Optional[Callable[[io.IOBase], Dict[str, Any]]],
) -> IterDataPipe[Dict[str, Any]]:
pass

Expand All @@ -163,7 +164,7 @@ def to_datapipe(
root: Union[str, pathlib.Path],
*,
config: Optional[DatasetConfig] = None,
decoder: Optional[Callable[[io.IOBase], torch.Tensor]] = None,
decoder: Optional[Callable[[io.IOBase], Dict[str, Any]]] = None,
) -> IterDataPipe[Dict[str, Any]]:
if not config:
config = self.info.default_config
Expand Down
33 changes: 33 additions & 0 deletions torchvision/prototype/datasets/utils/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
"path_accessor",
"path_comparator",
"Decompressor",
"RarArchiveReader",
]

K = TypeVar("K")
Expand Down Expand Up @@ -176,6 +177,38 @@ def __iter__(self) -> Iterator[Tuple[str, io.IOBase]]:
yield path, decompressor(file)


class RarArchiveReader(IterDataPipe[Tuple[str, io.BufferedIOBase]]):
    """Datapipe that expands RAR archives into their members.

    Consumes ``(path, stream)`` pairs where ``stream`` is a readable handle of
    a RAR archive, and yields one ``(inner_path, file_handle)`` pair per
    archive member, skipping directory entries. ``inner_path`` is the archive
    path joined with the member's path inside the archive.

    Requires the third-party ``rarfile`` package plus at least one system
    tool that can read RAR archives.
    """

    def __init__(self, datapipe: IterDataPipe[Tuple[str, io.BufferedIOBase]]):
        # Resolve the optional dependency up front so that construction fails
        # early instead of on first iteration.
        self._rarfile = self._verify_dependencies()
        super().__init__()
        self.datapipe = datapipe

    @staticmethod
    def _verify_dependencies():
        """Import and return the ``rarfile`` module, raising a helpful error
        if it (or a system extraction tool) is missing."""
        try:
            import rarfile
        except ImportError as error:
            raise ModuleNotFoundError(
                "Package `rarfile` is required to be installed to use this datapipe. "
                "Please use `pip install rarfile` or `conda install -c conda-forge rarfile` to install it."
            ) from error

        # check if at least one system library for reading rar archives is available to be used by rarfile
        rarfile.tool_setup()

        return rarfile

    def __iter__(self) -> Iterator[Tuple[str, io.BufferedIOBase]]:
        for path, stream in self.datapipe:
            rar = self._rarfile.RarFile(stream)
            for info in rar.infolist():
                # directory entries carry no data
                if info.filename.endswith("/"):
                    continue

                inner_path = os.path.join(path, info.filename)
                file_obj = rar.open(info)

                yield inner_path, file_obj
class PicklerDataPipe(IterDataPipe):
def __init__(self, source_datapipe: IterDataPipe[Tuple[str, IO[bytes]]]) -> None:
self.source_datapipe = source_datapipe
Expand Down
171 changes: 171 additions & 0 deletions torchvision/prototype/datasets/video_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from typing import Any, Dict, Iterator
import random
import av
import numpy as np
import torch
from torchdata.datapipes.iter import IterDataPipe
from torchvision.io import video, _video_opt
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I would use _video_opt in here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure.
Any particular reason why not?


class AVKeyframeReader(IterDataPipe[Dict[str, Any]]):
    def __init__(self, video_dp: IterDataPipe[Dict[str, Any]]) -> None:
        """TorchData IterDataPipe that takes in a video datapipe
        and yields every keyframe of every video.

        Args:
            video_dp (IterDataPipe[Dict[str, Any]]): Video dataset IterDataPipe
        """
        self.datapipe = video_dp

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        for sample in self.datapipe:
            handle = sample.pop("file")
            with av.open(handle, metadata_errors="ignore") as container:
                video_stream = container.streams.video[0]
                # restrict decoding to keyframes only
                video_stream.codec_context.skip_frame = 'NONKEY'
                for frame in container.decode(video_stream):
                    meta = {
                        "time_base": float(frame.time_base),
                        "guessed_fps": float(video_stream.guessed_rate),
                    }
                    yield dict(sample, frame=frame.to_image(), pts=frame.pts, video_meta=meta)

class AVRandomFrameReader(IterDataPipe[Dict[str, Any]]):
    def __init__(self, video_dp: IterDataPipe[Dict[str, Any]], num_samples: int = 1, transform=None) -> None:
        """TorchData IterDataPipe that takes in a video datapipe
        and yields `num_samples` random frames from a video.

        Args:
            video_dp (IterDataPipe[Dict[str, Any]]): Video dataset IterDataPipe
            num_samples (int, optional): Number of frames to sample from each video. Defaults to 1.
            transform (optional): accepted for API compatibility but currently
                unused — TODO(review): either apply it to the yielded frames or
                remove the parameter.
        """
        self.datapipe = video_dp
        self.num_samples = num_samples

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        for video_d in self.datapipe:
            buffer = video_d.pop("file")
            with av.open(buffer, metadata_errors="ignore") as container:
                stream = container.streams.video[0]
                # duration is given in time_base units as int
                duration = stream.duration
                # seek to random frames; range is already a sequence, so sample
                # from it directly instead of materializing a (potentially
                # huge) list of all timestamps
                seek_idxs = random.sample(range(duration), self.num_samples)
                for i in seek_idxs:
                    container.seek(i, any_frame=True, stream=stream)
                    frame = next(container.decode(stream))
                    img = frame.to_image()

                    video_meta = {
                        "time_base": float(frame.time_base),
                        "guessed_fps": float(stream.guessed_rate),
                    }

                    yield dict(video_d, frame=img, pts=frame.pts, video_meta=video_meta)

class AVClipReader(IterDataPipe[Dict[str, Any]]):
    def __init__(
        self,
        video_dp: IterDataPipe[Dict[str, Any]],
        num_frames_per_clip: int = 8,
        num_clips_per_video: int = 1,
        step_between_clips: int = 1,
    ) -> None:
        """TorchData IterDataPipe that takes in a video datapipe
        and yields `num_clips_per_video` video clips (sequences of `num_frames_per_clip` frames) from a video.
        Clips are sampled from all possible clips of length `num_frames_per_clip` spaced `step_between_clips` apart.

        Args:
            video_dp (IterDataPipe[Dict[str, Any]]): Video dataset IterDataPipe
            num_frames_per_clip (int, optional): Length of a video clip in frames. Defaults to 8.
            num_clips_per_video (int, optional): How many clips to sample from each video. Defaults to 1.
            step_between_clips (int, optional): Minimum spacing between two clips. Defaults to 1.
        """

        self.datapipe = video_dp
        self.num_frames_per_clip = num_frames_per_clip
        self.num_clips_per_video = num_clips_per_video
        self.step_between_clips = step_between_clips

    def _unfold(self, tensor: torch.Tensor, dilation: int = 1) -> torch.Tensor:
        """
        Similar to tensor.unfold, but with the dilation
        and specialized for 1d tensors.

        Returns all consecutive windows of `self.num_frames_per_clip` elements, with
        `self.step_between_clips` between windows. The distance between each element
        in a window is given by `dilation`.
        """
        assert tensor.dim() == 1
        o_stride = tensor.stride(0)
        numel = tensor.numel()
        new_stride = (self.step_between_clips * o_stride, dilation * o_stride)
        new_size = (
            (numel - (dilation * (self.num_frames_per_clip - 1) + 1)) // self.step_between_clips + 1,
            self.num_frames_per_clip,
        )
        # not enough elements for even one window -> empty (0, window) view
        if new_size[0] < 1:
            new_size = (0, self.num_frames_per_clip)
        return torch.as_strided(tensor, new_size, new_stride)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        for video_d in self.datapipe:
            buffer = video_d["file"]
            with av.open(buffer, metadata_errors="ignore") as container:
                stream = container.streams.video[0]
                time_base = stream.time_base

                # duration is given in time_base units as int
                duration = stream.duration

                # get video_stream timestamps,
                # with a tolerance for pyav imprecision
                _ptss = torch.arange(duration - 7)
                _ptss = self._unfold(_ptss)
                # shuffle the clips and keep the first num_clips_per_video
                perm = torch.randperm(_ptss.size(0))
                idx = perm[: self.num_clips_per_video]
                samples = _ptss[idx]

                for clip_pts in samples:
                    start_pts = clip_pts[0].item()
                    end_pts = clip_pts[-1].item()
                    # video_timebase is the default time_base
                    pts_unit = "pts"
                    start_pts, end_pts, pts_unit = _video_opt._convert_to_sec(start_pts, end_pts, "pts", time_base)
                    video_frames = video._read_from_stream(
                        container,
                        start_pts,
                        end_pts,
                        pts_unit,
                        stream,
                        {"video": 0},
                    )

                    vframes_list = [frame.to_ndarray(format='rgb24') for frame in video_frames]

                    if vframes_list:
                        vframes = torch.as_tensor(np.stack(vframes_list))
                        # account for rounding errors in conversion
                        # FIXME: fix this in the code
                        vframes = vframes[: self.num_frames_per_clip, ...]
                    else:
                        # NOTE(review): this empty fallback trips the frame-count
                        # assert below; consider skipping such clips instead.
                        vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8)

                    # [N,H,W,C] to [N,C,H,W]
                    vframes = vframes.permute(0, 3, 1, 2)
                    assert vframes.size(0) == self.num_frames_per_clip

                    # TODO: support sampling rates (FPS change)
                    # TODO: optimization (read all and select)

                    yield {
                        "clip": vframes,
                        "pts": clip_pts,
                        "range": (start_pts, end_pts),
                        "video_meta": {
                            "time_base": float(stream.time_base),
                            "guessed_fps": float(stream.guessed_rate),
                        },
                        "path": video_d["path"],
                        "target": video_d["target"],
                    }