
Commit 0c464f6

Merge pull request pytorch#3 from sprt/tpu_compat
Mask R-CNN on TPU
2 parents f2031cc + 3906ba9, commit 0c464f6

File tree

9 files changed (+51, -29 lines)


references/detection/train_tpu.py

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ def do_prediction(image_path, use_cpu=False):
     model.eval()

     image = Image.open(image_path)
+    image = torchvision.transforms.functional.resize(image, (800, 600))
     image_tensor = torchvision.transforms.functional.to_tensor(image)

     if not use_cpu:
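
Note: the added resize pins every input image to a fixed 800x600 size before it is turned into a tensor. A plausible motivation (an assumption, not stated in the commit) is that XLA compiles a separate graph per tensor shape, so fixed-size inputs avoid recompiling for each new image. A minimal sketch of the same preprocessing, with "sample.jpg" standing in for a real image path:

import torchvision
from PIL import Image

# Hypothetical image path; any RGB image works here.
image = Image.open("sample.jpg")
# Fixed (height, width) so every input tensor has the same shape.
image = torchvision.transforms.functional.resize(image, (800, 600))
image_tensor = torchvision.transforms.functional.to_tensor(image)
print(image_tensor.shape)  # torch.Size([3, 800, 600])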

torchvision/csrc/cpu/nms_cpu.cpp

Lines changed: 15 additions & 4 deletions
@@ -4,11 +4,15 @@ template <typename scalar_t>
 at::Tensor nms_cpu_kernel(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const float iou_threshold) {
+    const double iou_threshold,
+    const long post_nms_top_n) {
   AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
   AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
   AT_ASSERTM(
       dets.type() == scores.type(), "dets should have the same type as scores");
+  AT_ASSERTM(
+      dets.size(0) >= post_nms_top_n,
+      "should have at least post_nms_top_n boxes");

   if (dets.numel() == 0)
     return at::empty({0}, dets.options().dtype(at::kLong));
@@ -41,7 +45,11 @@ at::Tensor nms_cpu_kernel(
     auto i = order[_i];
     if (suppressed[i] == 1)
       continue;
+
     keep[num_to_keep++] = i;
+    if (num_to_keep == post_nms_top_n)
+      goto end;
+
     auto ix1 = x1[i];
     auto iy1 = y1[i];
     auto ix2 = x2[i];
@@ -65,17 +73,20 @@ at::Tensor nms_cpu_kernel(
         suppressed[j] = 1;
     }
   }
-  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+
+end:
+  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/post_nms_top_n);
 }

 at::Tensor nms_cpu(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const float iou_threshold) {
+    const double iou_threshold,
+    const long post_nms_top_n) {
   auto result = at::empty({0}, dets.options());

   AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
-    result = nms_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
+    result = nms_cpu_kernel<scalar_t>(dets, scores, iou_threshold, post_nms_top_n);
   });
   return result;
 }
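
Taken together, these hunks change nms_cpu_kernel from returning a variable number of indices to returning exactly post_nms_top_n of them: the new assert requires at least post_nms_top_n input boxes, the loop exits via `goto end` as soon as that many are kept, and the final narrow always slices a fixed-length result. A likely motivation (an assumption, not stated in the diff) is that a constant output shape avoids shape-dependent recompilation on TPU. A hedged Python sketch of the same algorithm, for illustration only (it is not the C++ kernel; box_iou from torchvision.ops stands in for the hand-written overlap code):

import torch
from torchvision.ops import box_iou  # boxes are (x1, y1, x2, y2)

def nms_top_n_sketch(boxes, scores, iou_threshold, post_nms_top_n):
    assert boxes.size(0) >= post_nms_top_n, "should have at least post_nms_top_n boxes"
    order = scores.argsort(descending=True)
    suppressed = torch.zeros(boxes.size(0), dtype=torch.bool)
    # Fixed-length output: slots that are never filled keep their initial value.
    keep = torch.zeros(post_nms_top_n, dtype=torch.int64)
    num_to_keep = 0
    for i in order.tolist():
        if suppressed[i]:
            continue
        keep[num_to_keep] = i
        num_to_keep += 1
        if num_to_keep == post_nms_top_n:
            break  # mirrors the `goto end;` early exit
        # Suppress remaining boxes that overlap the kept box too strongly.
        ious = box_iou(boxes[i:i + 1], boxes).squeeze(0)
        suppressed |= ious > iou_threshold
    return keep

When suppression leaves fewer than post_nms_top_n survivors, the trailing slots stay at their initial value, so callers always see an index tensor of constant shape.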

torchvision/csrc/cpu/vision_cpu.h

Lines changed: 2 additions & 1 deletion
@@ -83,4 +83,5 @@ at::Tensor PSROIAlign_backward_cpu(
 at::Tensor nms_cpu(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const float iou_threshold);
+    const double iou_threshold,
+    const long post_nms_top_n);

torchvision/csrc/nms.h

Lines changed: 3 additions & 2 deletions
@@ -8,7 +8,8 @@
 at::Tensor nms(
     const at::Tensor& dets,
     const at::Tensor& scores,
-    const double iou_threshold) {
+    const double iou_threshold,
+    const long post_nms_top_n) {
   if (dets.device().is_cuda()) {
 #ifdef WITH_CUDA
     if (dets.numel() == 0) {
@@ -21,6 +22,6 @@ at::Tensor nms(
 #endif
   }

-  at::Tensor result = nms_cpu(dets, scores, iou_threshold);
+  at::Tensor result = nms_cpu(dets, scores, iou_threshold, post_nms_top_n);
   return result;
 }

torchvision/models/detection/_utils.py

Lines changed: 4 additions & 4 deletions
@@ -202,10 +202,10 @@ def decode_single(self, rel_codes, boxes):
         pred_w = torch.exp(dw) * widths[:, None]
         pred_h = torch.exp(dh) * heights[:, None]

-        pred_boxes1 = pred_ctr_x - torch.tensor(0.5, dtype=pred_ctr_x.dtype) * pred_w
-        pred_boxes2 = pred_ctr_y - torch.tensor(0.5, dtype=pred_ctr_y.dtype) * pred_h
-        pred_boxes3 = pred_ctr_x + torch.tensor(0.5, dtype=pred_ctr_x.dtype) * pred_w
-        pred_boxes4 = pred_ctr_y + torch.tensor(0.5, dtype=pred_ctr_y.dtype) * pred_h
+        pred_boxes1 = pred_ctr_x - torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_ctr_x.device) * pred_w
+        pred_boxes2 = pred_ctr_y - torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_ctr_y.device) * pred_h
+        pred_boxes3 = pred_ctr_x + torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_ctr_x.device) * pred_w
+        pred_boxes4 = pred_ctr_y + torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_ctr_y.device) * pred_h
         pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
         return pred_boxes
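
The only change here is the explicit device= argument: torch.tensor(0.5, dtype=...) allocates the scalar on the CPU by default, and mixing that CPU scalar with tensors living on the XLA device presumably either fails or forces an extra transfer, so the constant is now created directly on the operands' device. A minimal sketch of the pattern, with pred_ctr_x and pred_w standing in for the decoder's intermediate tensors (on CPU here so the snippet runs anywhere):

import torch

# Stand-ins for the decoder's intermediates; in the real code these live on the XLA device.
pred_ctr_x = torch.rand(8, 1)
pred_w = torch.rand(8, 1)

# Create the 0.5 constant with the same dtype and on the same device as its operands.
half = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_ctr_x.device)
pred_boxes1 = pred_ctr_x - half * pred_w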

torchvision/models/detection/roi_heads.py

Lines changed: 1 addition & 3 deletions
@@ -603,9 +603,7 @@ def postprocess_detections(self, class_logits, box_regression, proposals, image_
             boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

             # non-maximum suppression, independently done per class
-            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
-            # keep only topk scoring predictions
-            keep = keep[:self.detections_per_img]
+            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh, self.detections_per_img)

             # keep.shape = [0]
             boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

torchvision/models/detection/rpn.py

Lines changed: 7 additions & 10 deletions
@@ -60,7 +60,7 @@ def __init__(
         self.sizes = sizes
         self.aspect_ratios = aspect_ratios
         self.cell_anchors = None
-        self._cache = {}
+        # self._cache = {}

     @staticmethod
     def generate_anchors(scales, aspect_ratios, dtype=torch.float32, device="cpu"):
@@ -76,8 +76,8 @@ def generate_anchors(scales, aspect_ratios, dtype=torch.float32, device="cpu"):
         return base_anchors.round()

     def set_cell_anchors(self, dtype, device):
-        if self.cell_anchors is not None:
-            return self.cell_anchors
+        # if self.cell_anchors is not None:
+        #     return self.cell_anchors
         cell_anchors = [
             self.generate_anchors(
                 sizes,
@@ -123,10 +123,10 @@ def grid_anchors(self, grid_sizes, strides):

     def cached_grid_anchors(self, grid_sizes, strides):
         key = tuple(grid_sizes) + tuple(strides)
-        if key in self._cache:
-            return self._cache[key]
+        # if key in self._cache:
+        #     return self._cache[key]
         anchors = self.grid_anchors(grid_sizes, strides)
-        self._cache[key] = anchors
+        # self._cache[key] = anchors
         return anchors

     def forward(self, image_list, feature_maps):
@@ -355,12 +355,9 @@ def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_
         final_scores = []
         for boxes, scores, lvl, img_shape in zip(proposals, objectness, levels, image_shapes):
             boxes = box_ops.clip_boxes_to_image(boxes, img_shape)
-            keep = box_ops.remove_small_boxes(boxes, self.min_size)
-            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
             # non-maximum suppression, independently done per level
-            keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)
+            keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh, self.post_nms_top_n)
             # keep only topk scoring predictions
-            keep = keep[:self.post_nms_top_n]
             boxes, scores = boxes[keep], scores[keep]
             final_boxes.append(boxes)
             final_scores.append(scores)

torchvision/ops/boxes.py

Lines changed: 11 additions & 4 deletions
@@ -1,7 +1,8 @@
 import torch
+import torch_xla


-def nms(boxes, scores, iou_threshold):
+def nms(boxes, scores, iou_threshold, post_nms_top_n):
     """
     Performs non-maximum suppression (NMS) on the boxes according
     to their intersection-over-union (IoU).
@@ -28,10 +29,16 @@ def nms(boxes, scores, iou_threshold):
         of the elements that have been kept
         by NMS, sorted in decreasing order of scores
     """
-    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
+    device = boxes.device
+    torch_xla._XLAC._xla_sync_multi([boxes, scores], devices=[])
+    boxes_cpu = boxes.cpu().clone()
+    scores_cpu = scores.cpu().clone()
+    keep = torch.ops.torchvision.nms(boxes_cpu, scores_cpu, iou_threshold)
+    keep = keep.to(device=device)
+    return keep


-def batched_nms(boxes, scores, idxs, iou_threshold):
+def batched_nms(boxes, scores, idxs, iou_threshold, post_nms_top_n):
     """
     Performs non-maximum suppression in a batched fashion.

@@ -67,7 +74,7 @@ def batched_nms(boxes, scores, idxs, iou_threshold):
     max_coordinate = boxes.max()
     offsets = idxs.to(boxes) * (max_coordinate + 1)
     boxes_for_nms = boxes + offsets[:, None]
-    keep = nms(boxes_for_nms, scores, iou_threshold)
+    keep = nms(boxes_for_nms, scores, iou_threshold, post_nms_top_n)
     print("ops/boxes.py; keep.shape: {}".format(keep.shape))
     return keep
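
nms() now implements a host fallback for lazy XLA tensors: torch_xla._XLAC._xla_sync_multi forces any pending computation on boxes and scores to materialize, the inputs are copied to CPU, the existing torchvision kernel runs there, and the kept indices are moved back to the original device. Two things worth noting from the hunk itself: the print(...) call is leftover debug output, and post_nms_top_n is accepted by both Python wrappers but not forwarded to torch.ops.torchvision.nms here, even though the C++ entry point above now takes it; whether a default is registered elsewhere is not visible in this diff. Below is a hedged sketch of the same sync-copy-compute-copy-back pattern applied to an arbitrary operator; run_on_cpu and the stand-in op are illustrative names, not part of the commit:

import torch
import torch_xla
import torch_xla.core.xla_model as xm

def run_on_cpu(op, *tensors):
    """Materialize lazy XLA tensors, run `op` on host copies, move the result back."""
    device = tensors[0].device
    # Flush pending lazy computations so .cpu() sees up-to-date values.
    torch_xla._XLAC._xla_sync_multi(list(tensors), devices=[])
    cpu_args = [t.cpu().clone() for t in tensors]
    result = op(*cpu_args)
    return result.to(device=device)

device = xm.xla_device()
boxes = torch.rand(100, 4, device=device) * 100
scores = torch.rand(100, device=device)
# Stand-in op: indices of the ten highest-scoring boxes, computed on the CPU.
keep = run_on_cpu(lambda b, s: s.argsort(descending=True)[:10], boxes, scores)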

torchvision/ops/poolers.py

Lines changed: 7 additions & 1 deletion
@@ -184,10 +184,16 @@ def forward(self, x, boxes, image_shapes):
             idx_in_level = torch.nonzero(levels == level).squeeze(1)
             rois_per_level = rois[idx_in_level]

+            xla_device = per_level_feature.device
+            torch_xla._XLAC._xla_sync_multi([per_level_feature, rois_per_level], devices=[])
+
+            per_level_feature_cpu = per_level_feature.cpu().clone()
+            rois_per_level_cpu = rois_per_level.cpu().clone()
             result_idx_in_level = roi_align(
                 per_level_feature, rois_per_level,
                 output_size=self.output_size,
-                spatial_scale=scale, sampling_ratio=self.sampling_ratio)
+                spatial_scale=scale, sampling_ratio=self.sampling_ratio
+            ).to(xla_device)

             if torchvision._is_tracing():
                 results.append(result_idx_in_level.to(dtype))
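
The pooler applies the same idea to roi_align: sync the XLA tensors, then move the result of the call back to the XLA device. As written in this hunk, though, per_level_feature_cpu and rois_per_level_cpu are created but never used; roi_align still receives the original per_level_feature and rois_per_level, and only the output is relocated with .to(xla_device). Whether the CPU copies were meant to be passed instead cannot be determined from this diff alone. For reference, a small self-contained sketch of roi_align running on host tensors, with the feature map and ROI values made up for illustration:

import torch
from torchvision.ops import roi_align

# Made-up CPU stand-ins for one FPN level and one region of interest.
per_level_feature_cpu = torch.rand(1, 256, 50, 50)                 # (N, C, H, W)
rois_per_level_cpu = torch.tensor([[0.0, 0.0, 0.0, 20.0, 20.0]])   # (batch_idx, x1, y1, x2, y2)

result = roi_align(
    per_level_feature_cpu, rois_per_level_cpu,
    output_size=(7, 7),
    spatial_scale=0.25, sampling_ratio=2)
print(result.shape)  # torch.Size([1, 256, 7, 7])
# In the pooler, the result would then be moved back with .to(xla_device).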
