|
| 1 | +import gzip |
| 2 | +import pathlib |
1 | 3 | import sys
|
2 | 4 |
|
3 | 5 | import numpy as np
|
4 | 6 | import pytest
|
5 | 7 | import torch
|
6 |
| -from datasets_utils import make_fake_flo_file |
| 8 | +from datasets_utils import make_fake_flo_file, make_tar |
| 9 | +from torchdata.datapipes.iter import FileOpener, TarArchiveLoader |
7 | 10 | from torchvision.datasets._optical_flow import _read_flo as read_flo_ref
|
8 |
| -from torchvision.prototype.datasets.utils import HttpResource, GDriveResource, Dataset |
| 11 | +from torchvision.datasets.utils import _decompress |
| 12 | +from torchvision.prototype.datasets.utils import HttpResource, GDriveResource, Dataset, OnlineResource |
9 | 13 | from torchvision.prototype.datasets.utils._internal import read_flo, fromfile
|
10 | 14 |
|
11 | 15 |
|
@@ -48,6 +52,183 @@ def test_read_flo(tmpdir):
|
48 | 52 | torch.testing.assert_close(actual, expected)
|
49 | 53 |
|
50 | 54 |
|
class TestOnlineResource:
    """Tests for `OnlineResource.load()` driven through a minimal stub subclass.

    The helpers below create plain files, nested folders, and tar archives inside a temporary directory so that the
    tests can check which on-disk candidate `load()` picks and when the download / preprocess hooks are invoked.
    """

    class DummyResource(OnlineResource):
        """`OnlineResource` stub whose download step is delegated to an optional, injectable callable."""

        def __init__(self, download_fn=None, **kwargs):
            super().__init__(**kwargs)
            self._download_fn = download_fn

        def _download(self, root):
            """Forward to `download_fn(self, root)`; raise `pytest.UsageError` if no `download_fn` was supplied."""
            if self._download_fn is not None:
                return self._download_fn(self, root)

            raise pytest.UsageError(
                "`_download()` was called, but `DummyResource(...)` was constructed without `download_fn`."
            )

    def _make_file(self, root, *, content, name="file.txt"):
        """Write `content` as text to `root / name` and return the resulting path."""
        target = root / name
        with open(target, "w") as stream:
            stream.write(content)

        return target

    def _make_folder(self, root, *, name="folder"):
        """Create `root / name` with a nested `subfolder` and three sentinel files.

        Two files are placed directly in the folder and one in the subfolder. Returns the folder path together with
        a dictionary that maps each file path (as a string) to the text written into that file.
        """
        folder = root / name
        nested = folder / "subfolder"
        nested.mkdir(parents=True)

        files = {}
        # The loop variable is deliberately not called `root` so that the parameter is not shadowed.
        for idx, parent in enumerate((folder, folder, nested)):
            sentinel = f"sentinel{idx}"
            created = self._make_file(parent, name=f"file{idx}.txt", content=sentinel)
            files[str(created)] = sentinel

        return folder, files

    def _make_tar(self, root, *, name="archive.tar", remove=True):
        """Build a sentinel folder, pack it with `make_tar`, and return `(archive, files)`.

        The keys of `files` are re-rooted below the archive path, i.e. `archive / <path relative to root>`, which is
        the form the archive tests compare the datapipe paths against. `remove` is passed through to `make_tar`.
        """
        # The folder is named after everything in front of the first dot of the archive name.
        folder_name = name.partition(".")[0]
        folder, folder_files = self._make_folder(root, name=folder_name)
        archive = make_tar(root, name, folder, remove=remove)

        archive_files = {}
        for file, content in folder_files.items():
            member = pathlib.Path(file).relative_to(root)
            archive_files[str(archive / member)] = content
        return archive, archive_files

    def test_load_file(self, tmp_path):
        """Loading an existing plain file yields a `FileOpener` with exactly one `(path, buffer)` item."""
        expected_content = "sentinel"
        file = self._make_file(tmp_path, content=expected_content)

        resource = self.DummyResource(file_name=file.name)
        dp = resource.load(tmp_path)
        assert isinstance(dp, FileOpener)

        items = list(dp)
        assert len(items) == 1

        path, buffer = items[0]
        assert path == str(file)
        assert buffer.read().decode() == expected_content

    def test_load_folder(self, tmp_path):
        """Loading an existing folder yields a `FileOpener` covering every file in it, including nested ones."""
        folder, expected = self._make_folder(tmp_path)

        resource = self.DummyResource(file_name=folder.name)
        dp = resource.load(tmp_path)
        assert isinstance(dp, FileOpener)

        actual = {path: buffer.read().decode() for path, buffer in dp}
        assert actual == expected

    def test_load_archive(self, tmp_path):
        """Loading an existing tar archive yields a `TarArchiveLoader` over the archive members."""
        archive, expected = self._make_tar(tmp_path)

        resource = self.DummyResource(file_name=archive.name)
        dp = resource.load(tmp_path)
        assert isinstance(dp, TarArchiveLoader)

        actual = {path: buffer.read().decode() for path, buffer in dp}
        assert actual == expected

    def test_priority_decompressed_gt_raw(self, tmp_path):
        """With `file.txt` and `file.txt.gz` side by side, the file without the `.gz` suffix is the one loaded."""
        # We don't need to actually compress here. Adding the suffix is sufficient
        self._make_file(tmp_path, content="raw_sentinel", name="file.txt.gz")
        decompressed = self._make_file(tmp_path, content="decompressed_sentinel", name="file.txt")

        resource = self.DummyResource(file_name=decompressed.name)
        dp = resource.load(tmp_path)

        first = next(iter(dp))
        path, buffer = first
        assert path == str(decompressed)
        assert buffer.read().decode() == "decompressed_sentinel"

    def test_priority_extracted_gt_decompressed(self, tmp_path):
        """When the archive is created with `remove=False`, `load()` must return a `FileOpener`."""
        # NOTE(review): presumably `remove=False` keeps the source folder next to the archive -- confirm in `make_tar`.
        archive, _ = self._make_tar(tmp_path, remove=False)

        resource = self.DummyResource(file_name=archive.name)
        dp = resource.load(tmp_path)

        # If the archive had been selected, this would be a `TarArchiveLoader`
        assert isinstance(dp, FileOpener)

    def test_download(self, tmp_path):
        """`load()` on a directory that lacks the file has to invoke the download hook."""
        calls = []

        def download_fn(resource, root):
            calls.append(root)
            return self._make_file(root, content="_", name=resource.file_name)

        resource = self.DummyResource(file_name="file.txt", download_fn=download_fn)
        resource.load(tmp_path)

        assert calls, "`download_fn()` was never called"

    # This tests the `"decompress"` literal as well as a custom callable
    @pytest.mark.parametrize(
        "preprocess",
        [
            "decompress",
            lambda path: _decompress(str(path), remove_finished=True),
        ],
    )
    def test_preprocess_decompress(self, tmp_path, preprocess):
        """A freshly downloaded gzip file is preprocessed and then loaded from the path without `.gz`."""
        file_name = "file.txt.gz"
        content = "sentinel"

        def download_fn(resource, root):
            compressed = root / resource.file_name
            with gzip.open(compressed, "wb") as stream:
                stream.write(content.encode())
            return compressed

        resource = self.DummyResource(file_name=file_name, preprocess=preprocess, download_fn=download_fn)
        dp = resource.load(tmp_path)

        items = list(dp)
        assert len(items) == 1

        path, buffer = items[0]
        expected_path = str(tmp_path / file_name).replace(".gz", "")
        assert path == expected_path
        assert buffer.read().decode() == content

    def test_preprocess_extract(self, tmp_path):
        """A freshly downloaded archive with `preprocess="extract"` is loaded as a `FileOpener` over its files."""
        files = None

        def download_fn(resource, root):
            nonlocal files
            archive, files = self._make_tar(root, name=resource.file_name)
            return archive

        resource = self.DummyResource(file_name="folder.tar", preprocess="extract", download_fn=download_fn)
        dp = resource.load(tmp_path)

        assert files is not None, "`download_fn()` was never called"
        assert isinstance(dp, FileOpener)

        actual = {path: buffer.read().decode() for path, buffer in dp}

        # The expected paths are the archive-rooted ones with the archive name swapped for its suffix-less stem.
        archive_name = resource.file_name
        stem = archive_name.split(".")[0]
        expected = {}
        for path, content in files.items():
            expected[path.replace(archive_name, stem)] = content
        assert actual == expected

    def test_preprocess_only_after_download(self, tmp_path):
        """`preprocess` must not be called when the requested file is already on disk."""
        existing = self._make_file(tmp_path, content="_")

        def preprocess(path):
            raise AssertionError("`preprocess` was called although the file was already present.")

        resource = self.DummyResource(file_name=existing.name, preprocess=preprocess)
        resource.load(tmp_path)
| 231 | + |
51 | 232 | class TestHttpResource:
|
52 | 233 | def test_resolve_to_http(self, mocker):
|
53 | 234 | file_name = "data.tar"
|
|
0 commit comments