
Commit f042027

Merge branch 'master' into bugfix/pyramidnet_init

2 parents: 050b64a + 32e5700

File tree: 11 files changed (+116, -28 lines)
Lines changed: 6 additions & 7 deletions

@@ -1,18 +1,18 @@
 # Video Classification
 
-TODO: Add some info about the context, dataset we use etc
+We present a simple training script that can be used for replicating the results of [resnet-based video models](https://research.fb.com/wp-content/uploads/2018/04/a-closer-look-at-spatiotemporal-convolutions-for-action-recognition.pdf). All models are trained on the [Kinetics400 dataset](https://deepmind.com/research/open-source/kinetics), a benchmark dataset for human-action recognition. The accuracy is reported on the traditional validation split.
 
 ## Data preparation
 
 If you already have downloaded [Kinetics400 dataset](https://deepmind.com/research/open-source/kinetics),
 please proceed directly to the next section.
 
-To download videos, one can use https://github.com/Showmax/kinetics-downloader
+To download videos, one can use https://github.com/Showmax/kinetics-downloader. Please note that the dataset can take upwards of 400GB, depending on the quality setting used during download.
 
 ## Training
 
 We assume the training and validation AVI videos are stored at `/data/kinectics400/train` and
-`/data/kinectics400/val`.
+`/data/kinectics400/val`. For training we suggest starting with the hyperparameters reported in the [paper](https://research.fb.com/wp-content/uploads/2018/04/a-closer-look-at-spatiotemporal-convolutions-for-action-recognition.pdf) in order to match the performance of those models. The clip sampling strategy is a particularly important training parameter, and we suggest using random temporal jittering, i.e. sampling multiple training clips from each video with random start times at every epoch. This functionality is built into our training script, and optimal hyperparameters are set by default.
 
 ### Multiple GPUs
 
@@ -21,7 +21,8 @@ Run the training on a single node with 8 GPUs:
 python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --apex
 ```
 
-
+**Note:** All our models were trained on 8 nodes with 8 V100 GPUs each, for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution.
+**Note 2:** Hyperparameters for exact replication of our training can be found [here](https://github.com/pytorch/vision/blob/master/torchvision/models/video/README.md). Some hyperparameters, such as the learning rate, are scaled linearly in proportion to the number of GPUs.
 
 ### Single GPU
 
@@ -30,6 +31,4 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-
 
 ```bash
 python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset
-```
-
-
+```
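
The random temporal jittering described in the README change above amounts to drawing a fresh clip start for each video at every epoch. Below is a minimal sketch of that sampling step, assuming a hypothetical `sample_clip_start` helper and 16-frame clips; the actual training script wires this into its dataset and clip sampler:

```python
import random

def sample_clip_start(num_frames: int, clip_len: int) -> int:
    # Random temporal jittering: draw the start index uniformly so the
    # clip [start, start + clip_len) stays inside the video.
    return random.randint(0, max(0, num_frames - clip_len))

# At every epoch, each video contributes clips with fresh random starts.
for epoch in range(2):
    for num_frames in (300, 120, 64):  # hypothetical per-video frame counts
        start = sample_clip_start(num_frames, clip_len=16)
        clip_indices = range(start, start + 16)
```

Per Note 2, hyperparameters such as the learning rate scale linearly with the number of GPUs, so moving from the 8-GPU command above to the 64-GPU setup implies an 8x larger learning rate.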

test/test_datasets_download.py

Lines changed: 2 additions & 1 deletion

@@ -194,7 +194,8 @@ def make_parametrize_kwargs(download_configs):
             caltech256(),
             cifar10(),
             cifar100(),
-            voc(),
+            # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details.
+            # voc(),
         )
     )
 )

test/test_image.py

Lines changed: 24 additions & 0 deletions

@@ -221,6 +221,18 @@ def test_read_file(self):
                 RuntimeError, "No such file or directory: 'tst'"):
             read_file('tst')
 
+    def test_read_file_non_ascii(self):
+        with get_tmp_dir() as d:
+            fname, content = '日本語(Japanese).bin', b'TorchVision\211\n'
+            fpath = os.path.join(d, fname)
+            with open(fpath, 'wb') as f:
+                f.write(content)
+
+            data = read_file(fpath)
+            expected = torch.tensor(list(content), dtype=torch.uint8)
+            self.assertTrue(data.equal(expected))
+            os.unlink(fpath)
+
     def test_write_file(self):
         with get_tmp_dir() as d:
             fname, content = 'test1.bin', b'TorchVision\211\n'
@@ -233,6 +245,18 @@ def test_write_file(self):
             self.assertEqual(content, saved_content)
             os.unlink(fpath)
 
+    def test_write_file_non_ascii(self):
+        with get_tmp_dir() as d:
+            fname, content = '日本語(Japanese).bin', b'TorchVision\211\n'
+            fpath = os.path.join(d, fname)
+            content_tensor = torch.tensor(list(content), dtype=torch.uint8)
+            write_file(fpath, content_tensor)
+
+            with open(fpath, 'rb') as f:
+                saved_content = f.read()
+            self.assertEqual(content, saved_content)
+            os.unlink(fpath)
+
 
 if __name__ == '__main__':
     unittest.main()

test/test_models.py

Lines changed: 2 additions & 4 deletions

@@ -8,7 +8,7 @@
 import unittest
 import random
 
-from torchvision.ops.misc import FrozenBatchNorm2d
+from torchvision.models.detection._utils import overwrite_eps
 
 
 def set_rng_seed(seed):
@@ -151,9 +151,7 @@ def _test_detection_model(self, name, dev):
             kwargs["score_thresh"] = 0.013
         model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False, **kwargs)
         if "keypointrcnn" in name or "retinanet" in name:
-            for module in model.modules():
-                if isinstance(module, FrozenBatchNorm2d):
-                    module.eps = 0
+            overwrite_eps(model, 0.0)
         model.eval().to(device=dev)
         input_shape = (3, 300, 300)
         # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests

torchvision/csrc/cpu/image/read_write_file_cpu.cpp

Lines changed: 52 additions & 13 deletions

@@ -1,17 +1,40 @@
 #include "read_write_file_cpu.h"
 
-// According to
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/stat-functions?view=vs-2019,
-// we should use _stat64 for 64-bit file size on Windows.
 #ifdef _WIN32
-#define VISION_STAT _stat64
-#else
-#define VISION_STAT stat
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+
+std::wstring utf8_decode(const std::string& str) {
+  if (str.empty()) {
+    return std::wstring();
+  }
+  int size_needed = MultiByteToWideChar(
+      CP_UTF8, 0, str.c_str(), static_cast<int>(str.size()), NULL, 0);
+  TORCH_CHECK(size_needed > 0, "Error converting the content to Unicode");
+  std::wstring wstrTo(size_needed, 0);
+  MultiByteToWideChar(
+      CP_UTF8,
+      0,
+      str.c_str(),
+      static_cast<int>(str.size()),
+      &wstrTo[0],
+      size_needed);
+  return wstrTo;
+}
 #endif
 
-torch::Tensor read_file(std::string filename) {
-  struct VISION_STAT stat_buf;
-  int rc = VISION_STAT(filename.c_str(), &stat_buf);
+torch::Tensor read_file(const std::string& filename) {
+#ifdef _WIN32
+  // According to
+  // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/stat-functions?view=vs-2019,
+  // we should use struct __stat64 and _wstat64 for 64-bit file size on Windows.
+  struct __stat64 stat_buf;
+  auto fileW = utf8_decode(filename);
+  int rc = _wstat64(fileW.c_str(), &stat_buf);
+#else
+  struct stat stat_buf;
+  int rc = stat(filename.c_str(), &stat_buf);
+#endif
   // errno is a variable defined in errno.h
   TORCH_CHECK(
       rc == 0, "[Errno ", errno, "] ", strerror(errno), ": '", filename, "'");
@@ -21,9 +44,20 @@ torch::Tensor read_file(std::string filename) {
   TORCH_CHECK(size > 0, "Expected a non empty file");
 
 #ifdef _WIN32
-  auto data =
-      torch::from_file(filename, /*shared=*/false, /*size=*/size, torch::kU8)
-          .clone();
+  // TODO: Once torch::from_file handles UTF-8 paths correctly, we should move
+  // back to use the following implementation since it uses file mapping.
+  // auto data =
+  //     torch::from_file(filename, /*shared=*/false, /*size=*/size,
+  //     torch::kU8).clone()
+  FILE* infile = _wfopen(fileW.c_str(), L"rb");
+
+  TORCH_CHECK(infile != nullptr, "Error opening input file");
+
+  auto data = torch::empty({size}, torch::kU8);
+  auto dataBytes = data.data_ptr<uint8_t>();
+
+  fread(dataBytes, sizeof(uint8_t), size, infile);
+  fclose(infile);
 #else
   auto data =
       torch::from_file(filename, /*shared=*/false, /*size=*/size, torch::kU8);
@@ -32,7 +66,7 @@ torch::Tensor read_file(std::string filename) {
   return data;
 }
 
-void write_file(std::string filename, torch::Tensor& data) {
+void write_file(const std::string& filename, torch::Tensor& data) {
   // Check that the input tensor is on CPU
   TORCH_CHECK(data.device() == torch::kCPU, "Input tensor should be on CPU");
 
@@ -44,7 +78,12 @@ void write_file(std::string filename, torch::Tensor& data) {
 
   auto fileBytes = data.data_ptr<uint8_t>();
   auto fileCStr = filename.c_str();
+#ifdef _WIN32
+  auto fileW = utf8_decode(filename);
+  FILE* outfile = _wfopen(fileW.c_str(), L"wb");
+#else
   FILE* outfile = fopen(fileCStr, "wb");
+#endif
 
   TORCH_CHECK(outfile != nullptr, "Error opening output file");

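On Windows, the implementation above gives up `torch::from_file`'s memory mapping (pending UTF-8 path support there) and instead opens the file with `_wfopen` and `fread`s it into a freshly allocated `kU8` tensor. A rough Python analogue of that fallback, for illustration only (the `read_file_fallback` name is hypothetical):

```python
import torch

def read_file_fallback(path: str) -> torch.Tensor:
    # Mirror of the C++ fallback above: read the whole file into memory
    # and wrap the bytes in a uint8 tensor instead of memory-mapping it.
    with open(path, 'rb') as f:  # Python's open() copes with non-ASCII paths
        content = f.read()
    return torch.tensor(list(content), dtype=torch.uint8)
```

The non-ASCII tests in test/test_image.py above build their expected tensors in exactly this way.
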
torchvision/csrc/cpu/image/read_write_file_cpu.h

Lines changed: 2 additions & 2 deletions

@@ -4,6 +4,6 @@
 #include <sys/stat.h>
 #include <torch/torch.h>
 
-C10_EXPORT torch::Tensor read_file(std::string filename);
+C10_EXPORT torch::Tensor read_file(const std::string& filename);
 
-C10_EXPORT void write_file(std::string filename, torch::Tensor& data);
+C10_EXPORT void write_file(const std::string& filename, torch::Tensor& data);

torchvision/models/detection/_utils.py

Lines changed: 20 additions & 1 deletion

@@ -3,7 +3,8 @@
 import torch
 from torch.jit.annotations import List, Tuple
 from torch import Tensor
-import torchvision
+
+from torchvision.ops.misc import FrozenBatchNorm2d
 
 
 class BalancedPositiveNegativeSampler(object):
@@ -349,3 +350,21 @@ def smooth_l1_loss(input, target, beta: float = 1. / 9, size_average: bool = Tru
     if size_average:
         return loss.mean()
     return loss.sum()
+
+
+def overwrite_eps(model, eps):
+    """
+    This method overwrites the default eps values of all the
+    FrozenBatchNorm2d layers of the model with the provided value.
+    This is necessary to address the BC-breaking change introduced
+    by the bug-fix at pytorch/vision#2933. The overwrite is applied
+    only when the pretrained weights are loaded to maintain compatibility
+    with previous versions.
+
+    Arguments:
+        model (nn.Module): The model on which we perform the overwrite.
+        eps (float): The new value of eps.
+    """
+    for module in model.modules():
+        if isinstance(module, FrozenBatchNorm2d):
+            module.eps = eps
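
To see why the overwrite is needed: since the fix in pytorch/vision#2933, `FrozenBatchNorm2d` actually applies its `eps` when normalizing, so weights published before the fix only reproduce their original outputs with `eps` forced back to 0. A small illustration of the effect (the values are synthetic, not from any checkpoint):

```python
import torch
from torchvision.ops.misc import FrozenBatchNorm2d

bn = FrozenBatchNorm2d(4)  # frozen stats default to mean=0, var=1
x = torch.randn(1, 4, 2, 2)

y_fixed = bn(x)   # post-fix: scale = weight / sqrt(running_var + eps)
bn.eps = 0.0      # what overwrite_eps(model, 0.0) does to every such layer
y_legacy = bn(x)  # matches the pre-fix computation

# Small but nonzero, enough to perturb pretrained detection outputs.
print((y_fixed - y_legacy).abs().max())
```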

torchvision/models/detection/faster_rcnn.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 from torchvision.ops import misc as misc_nn_ops
 from torchvision.ops import MultiScaleRoIAlign
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from .anchor_utils import AnchorGenerator
@@ -361,4 +362,5 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls['fasterrcnn_resnet50_fpn_coco'],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model

torchvision/models/detection/keypoint_rcnn.py

Lines changed: 2 additions & 0 deletions

@@ -3,6 +3,7 @@
 
 from torchvision.ops import MultiScaleRoIAlign
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from .faster_rcnn import FasterRCNN
@@ -332,4 +333,5 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls[key],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model

torchvision/models/detection/mask_rcnn.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 from torchvision.ops import misc as misc_nn_ops
 from torchvision.ops import MultiScaleRoIAlign
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from .faster_rcnn import FasterRCNN
@@ -328,4 +329,5 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls['maskrcnn_resnet50_fpn_coco'],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model

torchvision/models/detection/retinanet.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 from torch import Tensor
 from torch.jit.annotations import Dict, List, Tuple
 
+from ._utils import overwrite_eps
 from ..utils import load_state_dict_from_url
 
 from . import _utils as det_utils
@@ -628,4 +629,5 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
         state_dict = load_state_dict_from_url(model_urls['retinanet_resnet50_fpn_coco'],
                                               progress=progress)
         model.load_state_dict(state_dict)
+        overwrite_eps(model, 0.0)
     return model
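
The four model files above all apply the same two-line change: import `overwrite_eps` and call it right after `model.load_state_dict(state_dict)` in the `pretrained=True` path. A condensed sketch of the shared pattern, with `builders` as an illustrative list:

```python
from torchvision.models import detection
from torchvision.models.detection._utils import overwrite_eps

builders = [
    detection.fasterrcnn_resnet50_fpn,
    detection.keypointrcnn_resnet50_fpn,
    detection.maskrcnn_resnet50_fpn,
    detection.retinanet_resnet50_fpn,
]
for build in builders:
    # pretrained=False here to avoid weight downloads; with pretrained=True
    # each builder now performs the overwrite_eps(model, 0.0) call itself.
    model = build(pretrained=False, pretrained_backbone=False)
    overwrite_eps(model, 0.0)
```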
