
Commit f042027

Merge branch 'master' into bugfix/pyramidnet_init

2 parents: 050b64a + 32e5700

File tree: 11 files changed (+116, -28 lines)
Lines changed: 6 additions & 7 deletions

@@ -1,18 +1,18 @@
 # Video Classification
 
-TODO: Add some info about the context, dataset we use etc
+We present a simple training script that can be used for replicating the results of [resnet-based video models](https://research.fb.com/wp-content/uploads/2018/04/a-closer-look-at-spatiotemporal-convolutions-for-action-recognition.pdf). All models are trained on the [Kinetics400 dataset](https://deepmind.com/research/open-source/kinetics), a benchmark dataset for human-action recognition. The accuracy is reported on the traditional validation split.
 
 ## Data preparation
 
 If you already have downloaded [Kinetics400 dataset](https://deepmind.com/research/open-source/kinetics),
 please proceed directly to the next section.
 
-To download videos, one can use https://github.com/Showmax/kinetics-downloader
+To download videos, one can use https://github.com/Showmax/kinetics-downloader. Please note that the dataset can take upwards of 400GB, depending on the quality setting used during download.
 
 ## Training
 
 We assume the training and validation AVI videos are stored at `/data/kinectics400/train` and
-`/data/kinectics400/val`.
+`/data/kinectics400/val`. For training we suggest starting with the hyperparameters reported in the [paper](https://research.fb.com/wp-content/uploads/2018/04/a-closer-look-at-spatiotemporal-convolutions-for-action-recognition.pdf) in order to match the performance of those models. The clip sampling strategy is a particularly important training parameter, and we suggest using random temporal jittering, i.e. sampling multiple training clips from each video with random start times at every epoch. This functionality is built into our training script, and optimal hyperparameters are set by default.
 
 ### Multiple GPUs
 
@@ -21,7 +21,8 @@ Run the training on a single node with 8 GPUs:
 python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --apex
 ```
 
-
+**Note:** All our models were trained on 8 nodes with 8 V100 GPUs each, for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution.
+**Note 2:** Hyperparameters for exact replication of our training can be found [here](https://github.com/pytorch/vision/blob/master/torchvision/models/video/README.md). Some hyperparameters, such as the learning rate, are scaled linearly in proportion to the number of GPUs.
 
 ### Single GPU
 
@@ -30,6 +31,4 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-
 
 ```bash
 python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset
-```
-
-
+```
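
The random temporal jittering described in the README change above amounts to drawing a fresh clip start for each video at every epoch. Below is a minimal sketch of that sampling step, assuming a hypothetical `sample_clip_start` helper and 16-frame clips; the actual training script wires this into its dataset and clip sampler:

```python
import random

def sample_clip_start(num_frames: int, clip_len: int) -> int:
    # Random temporal jittering: draw the start index uniformly so the
    # clip [start, start + clip_len) stays inside the video.
    return random.randint(0, max(0, num_frames - clip_len))

# At every epoch, each video contributes clips with fresh random starts.
for epoch in range(2):
    for num_frames in (300, 120, 64):  # hypothetical per-video frame counts
        start = sample_clip_start(num_frames, clip_len=16)
        clip_indices = range(start, start + 16)
```

Per Note 2, hyperparameters such as the learning rate scale linearly with the number of GPUs, so moving from the 8-GPU command above to the 64-GPU setup implies an 8x larger learning rate.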

test/test_datasets_download.py

Lines changed: 2 additions & 1 deletion

@@ -194,7 +194,8 @@ def make_parametrize_kwargs(download_configs):
             caltech256(),
             cifar10(),
             cifar100(),
-            voc(),
+            # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details.
+            # voc(),
         )
     )
 )

test/test_image.py

Lines changed: 24 additions & 0 deletions

@@ -221,6 +221,18 @@ def test_read_file(self):
                 RuntimeError, "No such file or directory: 'tst'"):
             read_file('tst')
 
+    def test_read_file_non_ascii(self):
+        with get_tmp_dir() as d:
+            fname, content = '日本語(Japanese).bin', b'TorchVision\211\n'
+            fpath = os.path.join(d, fname)
+            with open(fpath, 'wb') as f:
+                f.write(content)
+
+            data = read_file(fpath)
+            expected = torch.tensor(list(content), dtype=torch.uint8)
+            self.assertTrue(data.equal(expected))
+            os.unlink(fpath)
+
     def test_write_file(self):
         with get_tmp_dir() as d:
             fname, content = 'test1.bin', b'TorchVision\211\n'
@@ -233,6 +245,18 @@ def test_write_file(self):
             self.assertEqual(content, saved_content)
             os.unlink(fpath)
 
+    def test_write_file_non_ascii(self):
+        with get_tmp_dir() as d:
+            fname, content = '日本語(Japanese).bin', b'TorchVision\211\n'
+            fpath = os.path.join(d, fname)
+            content_tensor = torch.tensor(list(content), dtype=torch.uint8)
+            write_file(fpath, content_tensor)
+
+            with open(fpath, 'rb') as f:
+                saved_content = f.read()
+            self.assertEqual(content, saved_content)
+            os.unlink(fpath)
+
 
 if __name__ == '__main__':
     unittest.main()

test/test_models.py

Lines changed: 2 additions & 4 deletions

@@ -8,7 +8,7 @@
 import unittest
 import random
 
-from torchvision.ops.misc import FrozenBatchNorm2d
+from torchvision.models.detection._utils import overwrite_eps
 
 
 def set_rng_seed(seed):
@@ -151,9 +151,7 @@ def _test_detection_model(self, name, dev):
             kwargs["score_thresh"] = 0.013
         model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False, **kwargs)
         if "keypointrcnn" in name or "retinanet" in name:
-            for module in model.modules():
-                if isinstance(module, FrozenBatchNorm2d):
-                    module.eps = 0
+            overwrite_eps(model, 0.0)
         model.eval().to(device=dev)
         input_shape = (3, 300, 300)
         # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests

torchvision/csrc/cpu/image/read_write_file_cpu.cpp

Lines changed: 52 additions & 13 deletions

@@ -1,17 +1,40 @@
 #include "read_write_file_cpu.h"
 
-// According to
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/stat-functions?view=vs-2019,
-// we should use _stat64 for 64-bit file size on Windows.
 #ifdef _WIN32
-#define VISION_STAT _stat64
-#else
-#define VISION_STAT stat
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+
+std::wstring utf8_decode(const std::string& str) {
+  if (str.empty()) {
+    return std::wstring();
+  }
+  int size_needed = MultiByteToWideChar(
+      CP_UTF8, 0, str.c_str(), static_cast<int>(str.size()), NULL, 0);
+  TORCH_CHECK(size_needed > 0, "Error converting the content to Unicode");
+  std::wstring wstrTo(size_needed, 0);
+  MultiByteToWideChar(
+      CP_UTF8,
+      0,
+      str.c_str(),
+      static_cast<int>(str.size()),
+      &wstrTo[0],
+      size_needed);
+  return wstrTo;
+}
 #endif
 
-torch::Tensor read_file(std::string filename) {
-  struct VISION_STAT stat_buf;
-  int rc = VISION_STAT(filename.c_str(), &stat_buf);
+torch::Tensor read_file(const std::string& filename) {
+#ifdef _WIN32
+  // According to
+  // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/stat-functions?view=vs-2019,
+  // we should use struct __stat64 and _wstat64 for 64-bit file size on Windows.
+  struct __stat64 stat_buf;
+  auto fileW = utf8_decode(filename);
+  int rc = _wstat64(fileW.c_str(), &stat_buf);
+#else
+  struct stat stat_buf;
+  int rc = stat(filename.c_str(), &stat_buf);
+#endif
   // errno is a variable defined in errno.h
   TORCH_CHECK(
       rc == 0, "[Errno ", errno, "] ", strerror(errno), ": '", filename, "'");
@@ -21,9 +44,20 @@ torch::Tensor read_file(std::string filename) {
   TORCH_CHECK(size > 0, "Expected a non empty file");
 
 #ifdef _WIN32
-  auto data =
-      torch::from_file(filename, /*shared=*/false, /*size=*/size, torch::kU8)
-          .clone();
+  // TODO: Once torch::from_file handles UTF-8 paths correctly, we should move
+  // back to use the following implementation since it uses file mapping.
+  // auto data =
+  //     torch::from_file(filename, /*shared=*/false, /*size=*/size,
+  //     torch::kU8).clone()
+  FILE* infile = _wfopen(fileW.c_str(), L"rb");
+
+  TORCH_CHECK(infile != nullptr, "Error opening input file");
+
+  auto data = torch::empty({size}, torch::kU8);
+  auto dataBytes = data.data_ptr<uint8_t>();
+
+  fread(dataBytes, sizeof(uint8_t), size, infile);
+  fclose(infile);
 #else
   auto data =
       torch::from_file(filename, /*shared=*/false, /*size=*/size, torch::kU8);
@@ -32,7 +66,7 @@ torch::Tensor read_file(std::string filename) {
   return data;
 }
 
-void write_file(std::string filename, torch::Tensor& data) {
+void write_file(const std::string& filename, torch::Tensor& data) {
   // Check that the input tensor is on CPU
   TORCH_CHECK(data.device() == torch::kCPU, "Input tensor should be on CPU");
 
@@ -44,7 +78,12 @@ void write_file(std::string filename, torch::Tensor& data) {
 
   auto fileBytes = data.data_ptr<uint8_t>();
   auto fileCStr = filename.c_str();
+#ifdef _WIN32
+  auto fileW = utf8_decode(filename);
+  FILE* outfile = _wfopen(fileW.c_str(), L"wb");
+#else
   FILE* outfile = fopen(fileCStr, "wb");
+#endif
 
   TORCH_CHECK(outfile != nullptr, "Error opening output file");

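On Windows, the implementation above gives up `torch::from_file`'s memory mapping (pending UTF-8 path support there) and instead opens the file with `_wfopen` and `fread`s it into a freshly allocated `kU8` tensor. A rough Python analogue of that fallback, for illustration only (the `read_file_fallback` name is hypothetical):

```python
import torch

def read_file_fallback(path: str) -> torch.Tensor:
    # Mirror of the C++ fallback above: read the whole file into memory
    # and wrap the bytes in a uint8 tensor instead of memory-mapping it.
    with open(path, 'rb') as f:  # Python's open() copes with non-ASCII paths
        content = f.read()
    return torch.tensor(list(content), dtype=torch.uint8)
```

The non-ASCII tests in test/test_image.py above build their expected tensors in exactly this way.
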
torchvision/csrc/cpu/image/read_write_file_cpu.h

Lines changed: 2 additions & 2 deletions

@@ -4,6 +4,6 @@
 #include <sys/stat.h>
 #include <torch/torch.h>
 
-C10_EXPORT torch::Tensor read_file(std::string filename);
+C10_EXPORT torch::Tensor read_file(const std::string& filename);
 
-C10_EXPORT void write_file(std::string filename, torch::Tensor& data);
+C10_EXPORT void write_file(const std::string& filename, torch::Tensor& data);

torchvision/models/detection/_utils.py

Lines changed: 20 additions & 1 deletion

@@ -3,7 +3,8 @@
 import torch
 from torch.jit.annotations import List, Tuple
 from torch import Tensor
-import torchvision
+
+from torchvision.ops.misc import FrozenBatchNorm2d
 
 
 class BalancedPositiveNegativeSampler(object):
@@ -349,3 +350,21 @@ def smooth_l1_loss(input, target, beta: float = 1. / 9, size_average: bool = Tru
     if size_average:
         return loss.mean()
     return loss.sum()
+
+
+def overwrite_eps(model, eps):
+    """
+    This method overwrites the default eps values of all the
+    FrozenBatchNorm2d layers of the model with the provided value.
+    This is necessary to address the BC-breaking change introduced
+    by the bug-fix at pytorch/vision#2933. The overwrite is applied
+    only when the pretrained weights are loaded to maintain compatibility
+    with previous versions.
+
+    Arguments:
+        model (nn.Module): The model on which we perform the overwrite.
+        eps (float): The new value of eps.
+    """
+    for module in model.modules():
+        if isinstance(module, FrozenBatchNorm2d):
+            module.eps = eps
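
To see why the overwrite is needed: since the fix in pytorch/vision#2933, `FrozenBatchNorm2d` actually applies its `eps` when normalizing, so weights published before the fix only reproduce their original outputs with `eps` forced back to 0. A small illustration of the effect (the values are synthetic, not from any checkpoint):

```python
import torch
from torchvision.ops.misc import FrozenBatchNorm2d

bn = FrozenBatchNorm2d(4)  # frozen stats default to mean=0, var=1
x = torch.randn(1, 4, 2, 2)

y_fixed = bn(x)   # post-fix: scale = weight / sqrt(running_var + eps)
bn.eps = 0.0      # what overwrite_eps(model, 0.0) does to every such layer
y_legacy = bn(x)  # matches the pre-fix computation

# Small but nonzero, enough to perturb pretrained detection outputs.
print((y_fixed - y_legacy).abs().max())
```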

torchvision/models/detection/faster_rcnn.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 from torchvision.ops import misc as misc_nn_ops
 from torchvision.ops import MultiScaleRoIAlign
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from .anchor_utils import AnchorGenerator
@@ -361,4 +362,5 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls['fasterrcnn_resnet50_fpn_coco'],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model

torchvision/models/detection/keypoint_rcnn.py

Lines changed: 2 additions & 0 deletions

@@ -3,6 +3,7 @@
 
 from torchvision.ops import MultiScaleRoIAlign
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from .faster_rcnn import FasterRCNN
@@ -332,4 +333,5 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls[key],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model

torchvision/models/detection/mask_rcnn.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 from torchvision.ops import misc as misc_nn_ops
 from torchvision.ops import MultiScaleRoIAlign
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from .faster_rcnn import FasterRCNN
@@ -328,4 +329,5 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls['maskrcnn_resnet50_fpn_coco'],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model

torchvision/models/detection/retinanet.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 from torch import Tensor
 from torch.jit.annotations import Dict, List, Tuple
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from . import _utils as det_utils
@@ -628,4 +629,5 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls['retinanet_resnet50_fpn_coco'],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model
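
The four model files above all apply the same two-line change: import `overwrite_eps` and call it right after `model.load_state_dict(state_dict)` in the `pretrained=True` path. A condensed sketch of the shared pattern, with `builders` as an illustrative list:

```python
from torchvision.models import detection
from torchvision.models.detection._utils import overwrite_eps

builders = [
    detection.fasterrcnn_resnet50_fpn,
    detection.keypointrcnn_resnet50_fpn,
    detection.maskrcnn_resnet50_fpn,
    detection.retinanet_resnet50_fpn,
]
for build in builders:
    # pretrained=False here to avoid weight downloads; with pretrained=True
    # each builder now performs the overwrite_eps(model, 0.0) call itself.
    model = build(pretrained=False, pretrained_backbone=False)
    overwrite_eps(model, 0.0)
```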
