From 10caef70219c3612aa1918c90b771a1880f54015 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Mon, 19 Dec 2016 09:49:29 -0200 Subject: [PATCH 01/18] Changes from yesterday --- fast_rcnn/main.py | 162 +++++++++++++++++++++++++++++++++++++++ fast_rcnn/model.py | 29 +++++++ fast_rcnn/roi_pooling.py | 43 +++++++++++ fast_rcnn/voc.py | 142 ++++++++++++++++++++++++++++++++++ 4 files changed, 376 insertions(+) create mode 100644 fast_rcnn/main.py create mode 100644 fast_rcnn/model.py create mode 100644 fast_rcnn/roi_pooling.py create mode 100644 fast_rcnn/voc.py diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py new file mode 100644 index 0000000000..2c07a72b9e --- /dev/null +++ b/fast_rcnn/main.py @@ -0,0 +1,162 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import torch.utils.trainer as trainer +import torch.utils.data +import numpy as np + +from roi_pooling import roi_pooling +from voc import VOCDetection, TransformVOCDetectionAnnotation + + +cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') +class_to_ind = dict(zip(cls, range(len(cls)))) + + +train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +# two possibilities +# 1. have a new dataset class that samples random boxes and outputs, like the batch provider +# 2. let the dataset do it internally +# lets go for 1 + +# image flip goes to the dataset class, not BoxSampler + +def bbox_overlaps(a, bb): + #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b + oo = [] + + for b in bb: + + x1 = a.select(1,0).clone() + x1[x1.lt(b[0])] = b[0] + y1 = a.select(1,1).clone() + y1[y1.lt(b[1])] = b[1] + x2 = a.select(1,2).clone() + x2[x2.gt(b[2])] = b[2] + y2 = a.select(1,3).clone() + y2[y2.gt(b[3])] = b[3] + + w = x2-x1+1 + h = y2-y1+1 + inter = torch.mul(w,h).float() + aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + + # intersection over union overlap + o = torch.div(inter , (aarea+barea-inter)) + # set invalid entries to 0 overlap + o[w.lt(0)] = 0 + o[h.lt(0)] = 0 + + oo += [o] + + return torch.cat([o.view(-1,1) for o in oo],1) + +def _generate_boxes(self, im): + #h, w = im.size()[1:] + w, h = im.size + x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) + y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) + + x = x[0] + y = y[0] + + return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) + + +class BoxSampler(torch.utils.data.Dataset): + + def __init__(self, dataset, num_boxes=128, fg_fraction=0.25, fg_threshold=0.5, bg_threshold=(0.0,0.5), generate_boxes=_generate_boxes): + super(BoxSampler, self).__init__() + self.dataset = dataset + self.num_boxes = num_boxes + self.fg_fraction = fg_fraction + self.fg_threshold = fg_threshold + self.bg_threshold = bg_threshold + self.generate_boxes = generate_boxes + + def _overlap_and_attribute(self, boxes, gt_roidb): + + #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) + overlaps = np.zeros((boxes.size(0), 20), dtype=np.float32) + + if gt_roidb is not None and gt_roidb['boxes'].size > 0: + gt_boxes = gt_roidb['boxes'] + gt_classes = np.array(gt_roidb['gt_classes']) + #gt_overlaps = bbox_overlaps(boxes.astype(np.float),gt_boxes.astype(np.float)) + 
gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() + argmaxes = gt_overlaps.argmax(axis=1) + maxes = gt_overlaps.max(axis=1) + + # remove low scoring + pos = maxes >= self.fg_threshold + neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) + maxes[neg] = 0 + # need to take care of bg_threshold + + I = np.where(maxes > 0)[0] + #I = np.where()[0] + overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] + + overlaps = overlaps[pos | neg] + boxes = boxes.numpy() + boxes = boxes[pos | neg] + #argmaxes[maxes == 0] = 0 + #return torch.from_numpy(argmaxes) + return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) + + def __getitem__(self, idx): + #super(BoxSampler, self).__getitem__(idx) + im, gt = self.dataset[idx] + boxes = self.generate_boxes(self, im) + boxes, labels = self._overlap_and_attribute(boxes, gt) + return im, boxes, labels + + def __len__(self): + return len(self.dataset) + + +ds = BoxSampler(train, 64*32, fg_threshold=0.75) + +def collate_fn(batch): + imgs, targets = zip(*batch) + imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) + targets = torch.LongTensor([[i] + t for i, t in enumerate(targets, 0)]) + + return imgs, targets + +train_loader = torch.utils.data.DataLoader( + train, batch_size=2, shuffle=True, num_workers=1, collate_fn=collate_fn) + + +def show(img, boxes, label, cls=None): + from PIL import Image, ImageDraw + #img, target = self.__getitem__(index) + if cls is None: + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + draw = ImageDraw.Draw(img) + for obj, t in zip(boxes, label): + if t > 0: + draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) + draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) + else: + pass + #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + img.show() + + +#im, box, label = ds[10] +#show(im,box,label) diff --git a/fast_rcnn/model.py b/fast_rcnn/model.py new file mode 100644 index 0000000000..44d7dddf9b --- /dev/null +++ b/fast_rcnn/model.py @@ -0,0 +1,29 @@ +import torch.nn as nn +from roi_pooling import roi_pooling + +class Network(nn.Container): + + def __init__(self, features, classifier): + super(Network, self).__init__() + self.features = features + self.classifier = classifier + + def forward(self, x): + images, rois = x + x = self.features(images) + x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) + x = self.classifier(x) + return x + +def basic_net(): + features = nn.Sequential(nn.Conv2d(3,16,3,16,1,1)) + classifier = nn.Sequential(nn.Linear(3*3*16,10)) + return Network(features, classifier) + +if __name__ == '__main__': + import torch + import torch.autograd + m = basic_net() + x = torch.autograd.Variable(torch.rand(1,3,224,224)) + b = torch.autograd.Variable(torch.LongTensor([[0,1,50,200,200],[0,50,50,200,200]])) + o = m((x,b)) diff --git a/fast_rcnn/roi_pooling.py b/fast_rcnn/roi_pooling.py new file mode 100644 index 0000000000..f56cbd6fd4 --- /dev/null +++ b/fast_rcnn/roi_pooling.py @@ -0,0 +1,43 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import math + +# approximation for the adaptive max pooling which is currently missing from nn +# doesn't work if the input is smaller than size +def adaptive_max_pool(input, size): + s = input.size()[2:] + assert(s[0]>= size[0] and s[1] >= size[1]) + ratio = [float(x)/y for x,y in zip(s, size)] + kernel_size = 
[int(math.ceil(x)) for x in ratio] + stride = kernel_size + remainder = [x*y-z for x, y, z in zip(kernel_size, size, s)] + padding = [int(math.floor((x+1)/2)) for x in remainder] + return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=True)(input) + #return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=False)(input) + +def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): + assert(rois.dim() == 2) + assert(rois.size(1) == 5) + output = [] + rois = rois.data.float() + num_rois = rois.size(0) + + rois[:,1:].mul_(spatial_scale) + rois = rois.long() + for i in range(num_rois): + roi = rois[i] + im_idx = roi[0] + im = input.narrow(0, im_idx, 1)[..., roi[2]:roi[4], roi[1]:roi[3]] + output.append(adaptive_max_pool(im, size)) + + return torch.cat(output, 0) + +if __name__ == '__main__': + input = ag.Variable(torch.rand(1,1,10,10), requires_grad=True) + rois = ag.Variable(torch.LongTensor([[0,1,2,7,8],[0,3,3,8,8]]),requires_grad=False) + #rois = ag.Variable(torch.LongTensor([[0,3,3,8,8]]),requires_grad=False) + + out = roi_pooling(input, rois, size=(3,3)) + out.backward(out.data.clone().uniform_()) + diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py new file mode 100644 index 0000000000..0929646e10 --- /dev/null +++ b/fast_rcnn/voc.py @@ -0,0 +1,142 @@ +import torch +import torch.utils.data as data +from PIL import Image, ImageDraw +import os +import os.path +import sys +if sys.version_info[0] == 2: + import xml.etree.cElementTree as ET +else: + import xml.etree.ElementTree as ET + + +class TransformVOCDetectionAnnotation(object): + def __init__(self, class_to_ind, keep_difficult=False): + self.keep_difficult = keep_difficult + self.class_to_ind = class_to_ind + + def __call__(self, target): + #res = [] + #res = {} + boxes = [] + gt_classes = [] + for obj in target.iter('object'): + difficult = int(obj.find('difficult').text) == 1 + if not self.keep_difficult and difficult: + continue + #name = obj.find('name').text + name = obj[0].text.lower().strip() + #bb = obj.find('bndbox') + bbox = obj[4] + #bndbox = [bb.find('xmin').text, bb.find('ymin').text, + # bb.find('xmax').text, bb.find('ymax').text] + # supposes the order is xmin, ymin, xmax, ymax + # attention with indices + bndbox = [int(bb.text)-1 for bb in bbox] + + #res += [bndbox + [name]] + #res += [bndbox + [class_to_ind[name]]] + boxes += [torch.LongTensor(bndbox)] + gt_classes += [self.class_to_ind[name]] + + res = { + 'boxes': torch.cat([b.view(1,-1) for b in boxes], 0), + 'gt_classes':gt_classes + } + return res + +class VOCSegmentation(data.Dataset): + def __init__(self, root, image_set, transform=None, target_transform=None): + self.root = root + self.image_set = image_set + self.transform = transform + self.target_transform = target_transform + + dataset_name = 'VOC2007' + self._annopath = os.path.join(self.root, dataset_name, 'SegmentationClass', '%s.png') + self._imgpath = os.path.join(self.root, dataset_name, 'JPEGImages', '%s.jpg') + self._imgsetpath = os.path.join(self.root, dataset_name, 'ImageSets', 'Segmentation', '%s.txt') + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip('\n') for x in self.ids] + + def __getitem__(self, index): + img_id = self.ids[index] + + target = Image.open(self._annopath % img_id)#.convert('RGB') + + img = Image.open(self._imgpath % img_id).convert('RGB') + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target 
+ + def __len__(self): + return len(self.ids) + + +class VOCDetection(data.Dataset): + def __init__(self, root, image_set, transform=None, target_transform=None): + self.root = root + self.image_set = image_set + self.transform = transform + self.target_transform = target_transform + + dataset_name = 'VOC2007' + self._annopath = os.path.join(self.root, dataset_name, 'Annotations', '%s.xml') + self._imgpath = os.path.join(self.root, dataset_name, 'JPEGImages', '%s.jpg') + self._imgsetpath = os.path.join(self.root, dataset_name, 'ImageSets', 'Main', '%s.txt') + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip('\n') for x in self.ids] + + def __getitem__(self, index): + img_id = self.ids[index] + + target = ET.parse(self._annopath % img_id).getroot() + + img = Image.open(self._imgpath % img_id).convert('RGB') + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.ids) + + def show(self, index): + img, target = self.__getitem__(index) + draw = ImageDraw.Draw(img) + for obj in target: + draw.rectangle(obj[0:4], outline=(255,0,0)) + draw.text(obj[0:2], obj[4], fill=(0,255,0)) + img.show() + +if __name__ == '__main__': + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + class_to_ind = dict(zip(cls, range(len(cls)))) + + ds = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + print(len(ds)) + img, target = ds[0] + print(target) + #ds.show(1) + #dss = VOCSegmentation('/home/francisco/work/datasets/VOCdevkit/', 'train') + #img, target = dss[0] + + #img.show() + #print(target_transform(target)) From 55b2bb0994f44257b9a9b1372df315cf025295b2 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Mon, 19 Dec 2016 09:49:52 -0200 Subject: [PATCH 02/18] Seems to work --- fast_rcnn/main.py | 42 ++++++++++++++++++++++++++++++++++-------- fast_rcnn/voc.py | 10 +++++----- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 2c07a72b9e..f38067aba7 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -8,6 +8,8 @@ from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation +from tqdm import tqdm + cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', @@ -85,7 +87,7 @@ def __init__(self, dataset, num_boxes=128, fg_fraction=0.25, fg_threshold=0.5, b def _overlap_and_attribute(self, boxes, gt_roidb): #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) - overlaps = np.zeros((boxes.size(0), 20), dtype=np.float32) + overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) if gt_roidb is not None and gt_roidb['boxes'].size > 0: gt_boxes = gt_roidb['boxes'] @@ -117,23 +119,40 @@ def __getitem__(self, idx): im, gt = self.dataset[idx] boxes = self.generate_boxes(self, im) boxes, labels = self._overlap_and_attribute(boxes, gt) + + if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() 
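+            # note: an equivalent HWC -> CHW conversion, assuming permute is
+            # available in this torch version, would be:
+            #   im = im.permute(2, 0, 1).contiguous()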
+ im = im.float().div_(255) + return im, boxes, labels def __len__(self): return len(self.dataset) -ds = BoxSampler(train, 64*32, fg_threshold=0.75) +ds = BoxSampler(train, 64, fg_threshold=0.5) def collate_fn(batch): - imgs, targets = zip(*batch) - imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) - targets = torch.LongTensor([[i] + t for i, t in enumerate(targets, 0)]) - - return imgs, targets + imgs, boxes, labels = zip(*batch) + max_size = [max(size) for size in zip(*[im.size() for im in imgs])] + new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) + for im, im2 in zip(new_imgs, imgs): + im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) + #imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) + #boxes = torch.LongTensor([[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)]) + #boxes = [[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)] + boxes = np.concatenate([np.column_stack((np.full(t.size(0), i), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) + boxes = torch.from_numpy(boxes) + labels = torch.cat(labels, 0) + return new_imgs, boxes, labels train_loader = torch.utils.data.DataLoader( - train, batch_size=2, shuffle=True, num_workers=1, collate_fn=collate_fn) + ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) def show(img, boxes, label, cls=None): @@ -157,6 +176,13 @@ def show(img, boxes, label, cls=None): #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() +for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): + pass + #print('====') + #print(i) + #print(img.size()) + #print(boxes.size()) + #print(labels.size()) #im, box, label = ds[10] #show(im,box,label) diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index 0929646e10..757f6ceef2 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -26,13 +26,13 @@ def __call__(self, target): continue #name = obj.find('name').text name = obj[0].text.lower().strip() - #bb = obj.find('bndbox') - bbox = obj[4] - #bndbox = [bb.find('xmin').text, bb.find('ymin').text, - # bb.find('xmax').text, bb.find('ymax').text] + bb = obj.find('bndbox') + #bbox = obj[4] + bndbox = [int(bb.find('xmin').text), int(bb.find('ymin').text), + int(bb.find('xmax').text), int(bb.find('ymax').text)] # supposes the order is xmin, ymin, xmax, ymax # attention with indices - bndbox = [int(bb.text)-1 for bb in bbox] + #bndbox = [int(bb.text)-1 for bb in bbox] #res += [bndbox + [name]] #res += [bndbox + [class_to_ind[name]]] From faa3b4e719601dc18b9b7bdb19f847cfb8c8a6e8 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Mon, 19 Dec 2016 12:03:54 -0200 Subject: [PATCH 03/18] Change generator --- fast_rcnn/main.py | 78 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index f38067aba7..fd6581324d 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -23,11 +23,6 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) -# two possibilities -# 1. have a new dataset class that samples random boxes and outputs, like the batch provider -# 2. 
let the dataset do it internally -# lets go for 1 - # image flip goes to the dataset class, not BoxSampler def bbox_overlaps(a, bb): @@ -61,25 +56,29 @@ def bbox_overlaps(a, bb): return torch.cat([o.view(-1,1) for o in oo],1) -def _generate_boxes(self, im): - #h, w = im.size()[1:] - w, h = im.size - x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) - y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) +class BoxGenerator(object): + def __init__(self, num_boxes=2000): + super(BoxGenerator, self).__init__() + self.num_boxes = num_boxes + + def __call__(self, im): + #h, w = im.size()[1:] + w, h = im.size + x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) + y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) - x = x[0] - y = y[0] + x = x[0] + y = y[0] - return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) + return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) class BoxSampler(torch.utils.data.Dataset): - def __init__(self, dataset, num_boxes=128, fg_fraction=0.25, fg_threshold=0.5, bg_threshold=(0.0,0.5), generate_boxes=_generate_boxes): + def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), + generate_boxes=BoxGenerator(num_boxes=10000)): super(BoxSampler, self).__init__() self.dataset = dataset - self.num_boxes = num_boxes - self.fg_fraction = fg_fraction self.fg_threshold = fg_threshold self.bg_threshold = bg_threshold self.generate_boxes = generate_boxes @@ -117,10 +116,10 @@ def _overlap_and_attribute(self, boxes, gt_roidb): def __getitem__(self, idx): #super(BoxSampler, self).__getitem__(idx) im, gt = self.dataset[idx] - boxes = self.generate_boxes(self, im) + boxes = self.generate_boxes(im) boxes, labels = self._overlap_and_attribute(boxes, gt) - if True: + if False: w, h = im.size im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) im = im.view(h, w, 3) @@ -135,7 +134,37 @@ def __len__(self): return len(self.dataset) -ds = BoxSampler(train, 64, fg_threshold=0.5) +class BoxSelector(torch.utils.data.Dataset): + def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): + super(BoxSelector, self).__init__() + self.dataset = dataset + self.num_boxes = num_boxes + self.fg_fraction = fg_fraction + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, idx): + im, boxes, labels = self.dataset[idx] + + boxes = boxes.numpy() + labels = labels.numpy() + + bg = np.where(labels == 0)[0] + fg = np.where(labels != 0)[0] + nfg = min(len(fg), self.num_boxes*self.fg_fraction) + nbg = min(len(bg), self.num_boxes - nfg) + + bg = bg[np.random.permutation(len(bg))[:nbg]] + fg = fg[np.random.permutation(len(fg))[:nfg]] + + I = np.concatenate([fg, bg], axis=0) + + return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) + + + +ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) def collate_fn(batch): imgs, boxes, labels = zip(*batch) @@ -146,7 +175,7 @@ def collate_fn(batch): #imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) #boxes = torch.LongTensor([[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)]) #boxes = [[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)] - boxes = np.concatenate([np.column_stack((np.full(t.size(0), i), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) + boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) boxes = torch.from_numpy(boxes) labels = torch.cat(labels, 0) return new_imgs, boxes, 
labels @@ -176,13 +205,14 @@ def show(img, boxes, label, cls=None): #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() -for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): - pass + +#for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): +# pass #print('====') #print(i) #print(img.size()) #print(boxes.size()) #print(labels.size()) -#im, box, label = ds[10] -#show(im,box,label) +im, box, label = ds[10] +show(im,box,label) From f2e92489b4152da6f6d85f1bdee58e25c2f19eaf Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Thu, 22 Dec 2016 10:57:40 -0200 Subject: [PATCH 04/18] fast rcnn No bbox regression --- fast_rcnn/main.py | 91 ++++++++++++++++++++++++++++++++++++++--------- fast_rcnn/voc.py | 4 +-- 2 files changed, 77 insertions(+), 18 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index fd6581324d..0cb7795536 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -23,6 +23,13 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +# TODO +# add class information in dataset +# separate in different files +# remove hard-coding 21 from Sampler +# cache the sampled boxes ? + # image flip goes to the dataset class, not BoxSampler def bbox_overlaps(a, bb): @@ -91,7 +98,6 @@ def _overlap_and_attribute(self, boxes, gt_roidb): if gt_roidb is not None and gt_roidb['boxes'].size > 0: gt_boxes = gt_roidb['boxes'] gt_classes = np.array(gt_roidb['gt_classes']) - #gt_overlaps = bbox_overlaps(boxes.astype(np.float),gt_boxes.astype(np.float)) gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() argmaxes = gt_overlaps.argmax(axis=1) maxes = gt_overlaps.max(axis=1) @@ -100,26 +106,21 @@ def _overlap_and_attribute(self, boxes, gt_roidb): pos = maxes >= self.fg_threshold neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) maxes[neg] = 0 - # need to take care of bg_threshold I = np.where(maxes > 0)[0] - #I = np.where()[0] overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] overlaps = overlaps[pos | neg] boxes = boxes.numpy() boxes = boxes[pos | neg] - #argmaxes[maxes == 0] = 0 - #return torch.from_numpy(argmaxes) return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) def __getitem__(self, idx): - #super(BoxSampler, self).__getitem__(idx) im, gt = self.dataset[idx] boxes = self.generate_boxes(im) boxes, labels = self._overlap_and_attribute(boxes, gt) - if False: + if True: w, h = im.size im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) im = im.view(h, w, 3) @@ -163,6 +164,63 @@ def __getitem__(self, idx): return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) +class ToPILImage(object): + """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W + or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C + to a PIL.Image of range [0, 255] + """ + def __call__(self, pic): + from PIL import Image, ImageOps + if isinstance(pic, np.ndarray): + # handle numpy array + img = Image.fromarray(pic) + else: + npimg = pic.mul(255).byte().numpy() + npimg = np.transpose(npimg, (1,2,0)) + img = Image.fromarray(npimg) + return img + +def make_grid(tensor, nrow=8, padding=2): + import math + """ + Given a 4D mini-batch Tensor of shape (B x C x H x W), + or a list of images all of the same size, + makes a grid of images + """ + tensorlist = None + if isinstance(tensor, list): + tensorlist = tensor + numImages = len(tensorlist) + size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) + tensor 
= tensorlist[0].new(size) + for i in range(numImages): + tensor[i].copy_(tensorlist[i]) + if tensor.dim() == 2: # single image H x W + tensor = tensor.view(1, tensor.size(0), tensor.size(1)) + if tensor.dim() == 3: # single image + if tensor.size(0) == 1: + tensor = torch.cat((tensor, tensor, tensor), 0) + return tensor + if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images + tensor = torch.cat((tensor, tensor, tensor), 1) + # make the mini-batch of images into a grid + nmaps = tensor.size(0) + xmaps = min(nrow, nmaps) + ymaps = int(math.ceil(nmaps / xmaps)) + height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) + grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) + k = 0 + for y in range(ymaps): + for x in range(xmaps): + if k >= nmaps: + break + grid.narrow(1, y*height+1+padding//2,height-padding)\ + .narrow(2, x*width+1+padding//2, width-padding)\ + .copy_(tensor[k]) + k = k + 1 + return grid + + ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) @@ -172,9 +230,6 @@ def collate_fn(batch): new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) for im, im2 in zip(new_imgs, imgs): im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) - #imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) - #boxes = torch.LongTensor([[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)]) - #boxes = [[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)] boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) boxes = torch.from_numpy(boxes) labels = torch.cat(labels, 0) @@ -201,18 +256,22 @@ def show(img, boxes, label, cls=None): draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) else: - pass - #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + #pass + draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() -#for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): -# pass +for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): + #grid = make_grid(img, 2, 1) + #grid = ToPILImage()(grid) + #grid.show() + #break + pass #print('====') #print(i) #print(img.size()) #print(boxes.size()) #print(labels.size()) -im, box, label = ds[10] -show(im,box,label) +#im, box, label = ds[10] +#show(im,box,label) diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index 757f6ceef2..164b6b9dd4 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -28,8 +28,8 @@ def __call__(self, target): name = obj[0].text.lower().strip() bb = obj.find('bndbox') #bbox = obj[4] - bndbox = [int(bb.find('xmin').text), int(bb.find('ymin').text), - int(bb.find('xmax').text), int(bb.find('ymax').text)] + bndbox = map(int, [bb.find('xmin').text, bb.find('ymin').text, + bb.find('xmax').text, bb.find('ymax').text]) # supposes the order is xmin, ymin, xmax, ymax # attention with indices #bndbox = [int(bb.text)-1 for bb in bbox] From 3b3f1aeb9c106230cc2cd42d6e0a17da002b5718 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Thu, 22 Dec 2016 10:58:13 -0200 Subject: [PATCH 05/18] Starting to prototype faster rcnn --- fast_rcnn/bbox_transform.py | 83 +++++++++++++++ fast_rcnn/faster_rcnn.py | 203 ++++++++++++++++++++++++++++++++++++ 2 files changed, 286 insertions(+) create mode 100644 fast_rcnn/bbox_transform.py create mode 100644 fast_rcnn/faster_rcnn.py diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py new file mode 100644 index 0000000000..358cdc9e07 --- /dev/null +++ 
b/fast_rcnn/bbox_transform.py @@ -0,0 +1,83 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def bbox_transform(ex_rois, gt_rois): + ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 + ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 + ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths + ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights + + gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 + gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 + gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths + gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights + + targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = np.log(gt_widths / ex_widths) + targets_dh = np.log(gt_heights / ex_heights) + + targets = np.vstack( + (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() + return targets + +def bbox_transform_inv(boxes, deltas): + if boxes.shape[0] == 0: + return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) + + boxes = boxes.astype(deltas.dtype, copy=False) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + dx = deltas[:, 0::4] + dy = deltas[:, 1::4] + dw = deltas[:, 2::4] + dh = deltas[:, 3::4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w + # y2 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h + + return pred_boxes + +def clip_boxes(boxes, im_shape): + """ + Clip boxes to image boundaries. + """ + + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + +def filter_boxes(boxes, min_size): + """Remove all boxes with any side smaller than min_size.""" + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws >= min_size) & (hs >= min_size))[0] + return keep diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py new file mode 100644 index 0000000000..73a10a98eb --- /dev/null +++ b/fast_rcnn/faster_rcnn.py @@ -0,0 +1,203 @@ +import torch.nn as nn +import numpy as np + +# clean up environment +from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes + +class FasterRCNN(nn.Container): + + def __init__(self): + self.rpn_param = { + '_feat_stride':16 + } + + # need to have a train and test mode + # should it support batched images ? + # need to pass target as argument only in train mode + def forward(self, x): + if self.train is True: + im, gt = x + # call model.train() here ? 
+ else + im = x + + feats = self._features(im) + + # improve + # it is used in get_anchors and also present in roi_pooling + self._feat_stride = round(im.size(4)/feats.size(4)) + # rpn + # put in a separate function + rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) + all_anchors = self.rpn_get_anchors(im) + #rpn_boxes = self.rpn_estimate(all_anchors, rpn_map) + if self.train is True: + rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) + # need to subsample boxes here + rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) + + # roi proposal + # clip, sort, pre nms topk, nms, after nms topk + # proposal_layer.py + # roi_boxes = self.get_roi_boxes(rpn_map, rpn_boxes) + roi_boxes = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform) + + if self.train is True: + # append gt boxes and sample fg / bg boxes + # proposal_target-layer.py + roi_boxes, frcnn_labels, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) + + # r-cnn + regions = self._roi_pooling(feats, roi_boxes) + scores, bbox_transform = self._classifier(regions) + + boxes = self.bbox_reg(roi_boxes, bbox_transform) + + # apply cls + bbox reg loss here + if self.train is True: + frcnn_loss = self.frcnn_loss(scores, boxes, frcnn_labels, frcnn_bbox_targets) + loss = frcnn_loss + rpn_loss + return loss, scores, boxes + + return scores, boxes + + # the user define their model in here + def _features(self, x): + # _feat_stride should be defined / inferred from here + pass + def _classifier(self, x): + pass + def _roi_pooling(self, x): + pass + def _rpn_classifier(self, x): + pass + + # from faster rcnn py + def rpn_get_anchors(self, im): + height, width = im.size()[-2:] + # 1. Generate proposals from bbox deltas and shifted anchors + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + all_anchors = (self._anchors.reshape((1, A, 4)) + + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + return all_anchors + + # restructure because we don't want -1 in labels + # shouldn't we instead keep only the bboxes for which labels >= 0? 
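+    # note: as written, rpn_targets returns labels for *all* anchors, encoded as
+    # 1 = foreground, 0 = background, -1 = ignore (see _unmap with fill=-1 below)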
+ def rpn_targets(self, all_anchors, im, gt): + total_anchors = all_anchors.shape[0] + gt_boxes = gt['boxes'] + + height, width = im.size()[-2:] + # only keep anchors inside the image + _allowed_border = 0 + inds_inside = np.where( + (all_anchors[:, 0] >= -_allowed_border) & + (all_anchors[:, 1] >= -_allowed_border) & + (all_anchors[:, 2] < width + _allowed_border) & # width + (all_anchors[:, 3] < height + _allowed_border) # height + )[0] + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + + # overlaps between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors, gt_boxes)#.numpy() + argmax_overlaps = overlaps.argmax(axis=1) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # fg label: for each gt, anchor with highest overlap + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IOU + labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + # subsample positive labels if we have too many + num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice( + bg_inds, size=(len(bg_inds) - num_bg), replace=False) + labels[disable_inds] = -1 + + #bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) + #bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) + bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) + + # map up to original set of anchors + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + + return labels, bbox_targets + + + # I need to know the original image size (or have the scaling factor) + def get_roi_boxes(self, all_anchors, rpn_map, rpn_bbox_deltas) + + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) + + # the first set of _num_anchors channels are bg probs + # the second set are the fg probs, which we want + #scores = bottom[0].data[:, self._num_anchors:, :, :] + scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) + + # Convert anchors into proposals via bbox transformations + proposals = bbox_transform_inv(anchors, bbox_deltas) + + # 2. clip predicted boxes to image + proposals = clip_boxes(proposals, im_info[:2]) + + # 3. remove predicted boxes with either height or width < threshold + # (NOTE: convert min_size to input image scale stored in im_info[2]) + keep = filter_boxes(proposals, min_size * im_info[2]) + proposals = proposals[keep, :] + scores = scores[keep] + + # 4. sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 
6000) + order = scores.ravel().argsort()[::-1] + if pre_nms_topN > 0: + order = order[:pre_nms_topN] + proposals = proposals[order, :] + scores = scores[order] + + # 6. apply nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. return the top proposals (-> RoIs top) + keep = nms(np.hstack((proposals, scores)), nms_thresh) + if post_nms_topN > 0: + keep = keep[:post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep] + + return roi_boxes From c653216c77f210b6458b6ace2d8cbc1a75dad891 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Fri, 23 Dec 2016 13:14:36 -0200 Subject: [PATCH 06/18] rpn runs Need to test for correctness --- fast_rcnn/bbox_transform.py | 35 ++++ fast_rcnn/faster_rcnn.py | 275 +++++++++++++----------------- fast_rcnn/generate_anchors.py | 105 ++++++++++++ fast_rcnn/py_cpu_nms.py | 38 +++++ fast_rcnn/rpn.py | 309 ++++++++++++++++++++++++++++++++++ 5 files changed, 603 insertions(+), 159 deletions(-) create mode 100644 fast_rcnn/generate_anchors.py create mode 100644 fast_rcnn/py_cpu_nms.py create mode 100644 fast_rcnn/rpn.py diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index 358cdc9e07..2775d1a3a4 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -5,6 +5,7 @@ # Written by Ross Girshick # -------------------------------------------------------- +import torch import numpy as np def bbox_transform(ex_rois, gt_rois): @@ -81,3 +82,37 @@ def filter_boxes(boxes, min_size): hs = boxes[:, 3] - boxes[:, 1] + 1 keep = np.where((ws >= min_size) & (hs >= min_size))[0] return keep + + +# torch tensors +def bbox_overlaps(a, bb): + oo = [] + + for b in bb: + + x1 = a.select(1,0).clone() + x1[x1.lt(b[0])] = b[0] + y1 = a.select(1,1).clone() + y1[y1.lt(b[1])] = b[1] + x2 = a.select(1,2).clone() + x2[x2.gt(b[2])] = b[2] + y2 = a.select(1,3).clone() + y2[y2.gt(b[3])] = b[3] + + w = x2-x1+1 + h = y2-y1+1 + inter = torch.mul(w,h).float() + aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + + # intersection over union overlap + o = torch.div(inter , (aarea+barea-inter)) + # set invalid entries to 0 overlap + o[w.lt(0)] = 0 + o[h.lt(0)] = 0 + + oo += [o] + + return torch.cat([o.view(-1,1) for o in oo],1) + + diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 73a10a98eb..33e9ad1918 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -1,21 +1,19 @@ import torch.nn as nn import numpy as np -# clean up environment -from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes - +# should handle multiple scales, how? class FasterRCNN(nn.Container): def __init__(self): - self.rpn_param = { - '_feat_stride':16 - } + super(FasterRCNN, self).__init__() + self.batch_size = 128 + self.fg_fraction = 0.25 + self.fg_threshold = 0.5 + self.bg_threshold = (0, 0.5) - # need to have a train and test mode # should it support batched images ? - # need to pass target as argument only in train mode def forward(self, x): - if self.train is True: + if self.training is True: im, gt = x # call model.train() here ? 
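            # note: self.training is presumably the flag toggled by the module's
            # train()/eval() methods, so the caller is expected to set the mode
            # before calling forward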
else @@ -23,26 +21,9 @@ def forward(self, x): feats = self._features(im) - # improve - # it is used in get_anchors and also present in roi_pooling - self._feat_stride = round(im.size(4)/feats.size(4)) - # rpn - # put in a separate function - rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) - all_anchors = self.rpn_get_anchors(im) - #rpn_boxes = self.rpn_estimate(all_anchors, rpn_map) - if self.train is True: - rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) - # need to subsample boxes here - rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) - - # roi proposal - # clip, sort, pre nms topk, nms, after nms topk - # proposal_layer.py - # roi_boxes = self.get_roi_boxes(rpn_map, rpn_boxes) - roi_boxes = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform) - - if self.train is True: + roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) + + if self.training is True: # append gt boxes and sample fg / bg boxes # proposal_target-layer.py roi_boxes, frcnn_labels, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) @@ -54,7 +35,7 @@ def forward(self, x): boxes = self.bbox_reg(roi_boxes, bbox_transform) # apply cls + bbox reg loss here - if self.train is True: + if self.training is True: frcnn_loss = self.frcnn_loss(scores, boxes, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes @@ -63,141 +44,117 @@ def forward(self, x): # the user define their model in here def _features(self, x): - # _feat_stride should be defined / inferred from here pass def _classifier(self, x): pass def _roi_pooling(self, x): pass - def _rpn_classifier(self, x): - pass - # from faster rcnn py - def rpn_get_anchors(self, im): - height, width = im.size()[-2:] - # 1. Generate proposals from bbox deltas and shifted anchors - shift_x = np.arange(0, width) * self._feat_stride - shift_y = np.arange(0, height) * self._feat_stride - shift_x, shift_y = np.meshgrid(shift_x, shift_y) - shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), - shift_x.ravel(), shift_y.ravel())).transpose() - # add A anchors (1, A, 4) to - # cell K shifts (K, 1, 4) to get - # shift anchors (K, A, 4) - # reshape to (K*A, 4) shifted anchors - A = self._num_anchors - K = shifts.shape[0] - all_anchors = (self._anchors.reshape((1, A, 4)) + - shifts.reshape((1, K, 4)).transpose((1, 0, 2))) - all_anchors = all_anchors.reshape((K * A, 4)) - return all_anchors - - # restructure because we don't want -1 in labels - # shouldn't we instead keep only the bboxes for which labels >= 0? 
- def rpn_targets(self, all_anchors, im, gt): - total_anchors = all_anchors.shape[0] + def frcnn_targets(self, all_rois, im, gt): gt_boxes = gt['boxes'] - - height, width = im.size()[-2:] - # only keep anchors inside the image - _allowed_border = 0 - inds_inside = np.where( - (all_anchors[:, 0] >= -_allowed_border) & - (all_anchors[:, 1] >= -_allowed_border) & - (all_anchors[:, 2] < width + _allowed_border) & # width - (all_anchors[:, 3] < height + _allowed_border) # height - )[0] - - # keep only inside anchors - anchors = all_anchors[inds_inside, :] - - # label: 1 is positive, 0 is negative, -1 is dont care - labels = np.empty((len(inds_inside), ), dtype=np.float32) - labels.fill(-1) - - # overlaps between the anchors and the gt boxes - # overlaps (ex, gt) - overlaps = bbox_overlaps(anchors, gt_boxes)#.numpy() - argmax_overlaps = overlaps.argmax(axis=1) - max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] - gt_argmax_overlaps = overlaps.argmax(axis=0) - gt_max_overlaps = overlaps[gt_argmax_overlaps, - np.arange(overlaps.shape[1])] - gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + gt_labels = gt['gt_classes'] + #zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) + #all_rois = np.vstack( + # (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) + #) + all_rois = np.vstack(all_rois, gt_boxes) + zeros = np.zeros((all_rois.shape[0], 1), dtype=all_rois.dtype) + all_rois = np.hstack((zeros, all_rois)) - # assign bg labels first so that positive labels can clobber them - labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 - - # fg label: for each gt, anchor with highest overlap - labels[gt_argmax_overlaps] = 1 - - # fg label: above threshold IOU - labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + num_images = 1 + rois_per_image = self.batch_size / num_images + fg_rois_per_image = np.round(self.fg_fraction * rois_per_image) + + # Sample rois with classification labels and bounding box regression + # targets + labels, rois, bbox_targets = _sample_rois( + all_rois, gt_boxes, gt_labels, fg_rois_per_image, + rois_per_image, self._num_classes) + + return all_rois, labels, rois, bbox_targets - # subsample positive labels if we have too many - num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) - fg_inds = np.where(labels == 1)[0] - if len(fg_inds) > num_fg: - disable_inds = npr.choice( - fg_inds, size=(len(fg_inds) - num_fg), replace=False) - labels[disable_inds] = -1 - - # subsample negative labels if we have too many - num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) - bg_inds = np.where(labels == 0)[0] - if len(bg_inds) > num_bg: - disable_inds = npr.choice( - bg_inds, size=(len(bg_inds) - num_bg), replace=False) - labels[disable_inds] = -1 - - #bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) - #bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) - bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) - - # map up to original set of anchors - labels = _unmap(labels, total_anchors, inds_inside, fill=-1) - bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) - - return labels, bbox_targets - - - # I need to know the original image size (or have the scaling factor) - def get_roi_boxes(self, all_anchors, rpn_map, rpn_bbox_deltas) - - bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) - - # the first set of _num_anchors channels are bg probs - # the second set are the fg probs, which we want - #scores = bottom[0].data[:, self._num_anchors:, :, 
:] - scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) - - # Convert anchors into proposals via bbox transformations - proposals = bbox_transform_inv(anchors, bbox_deltas) - - # 2. clip predicted boxes to image - proposals = clip_boxes(proposals, im_info[:2]) - - # 3. remove predicted boxes with either height or width < threshold - # (NOTE: convert min_size to input image scale stored in im_info[2]) - keep = filter_boxes(proposals, min_size * im_info[2]) - proposals = proposals[keep, :] - scores = scores[keep] - - # 4. sort all (proposal, score) pairs by score from highest to lowest - # 5. take top pre_nms_topN (e.g. 6000) - order = scores.ravel().argsort()[::-1] - if pre_nms_topN > 0: - order = order[:pre_nms_topN] - proposals = proposals[order, :] - scores = scores[order] - - # 6. apply nms (e.g. threshold = 0.7) - # 7. take after_nms_topN (e.g. 300) - # 8. return the top proposals (-> RoIs top) - keep = nms(np.hstack((proposals, scores)), nms_thresh) - if post_nms_topN > 0: - keep = keep[:post_nms_topN] - proposals = proposals[keep, :] - scores = scores[keep] - - return roi_boxes +def _get_bbox_regression_labels(bbox_target_data, num_classes): + """Bounding-box regression targets (bbox_target_data) are stored in a + compact form N x (class, tx, ty, tw, th) + This function expands those targets into the 4-of-4*K representation used + by the network (i.e. only one class has non-zero targets). + Returns: + bbox_target (ndarray): N x 4K blob of regression targets + bbox_inside_weights (ndarray): N x 4K blob of loss weights + """ + + clss = bbox_target_data[:, 0] + bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) + bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) + inds = np.where(clss > 0)[0] + for ind in inds: + cls = clss[ind] + start = 4 * cls + end = start + 4 + bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] + return bbox_targets + + +def _compute_targets(ex_rois, gt_rois, labels): + """Compute bounding-box regression targets for an image.""" + + assert ex_rois.shape[0] == gt_rois.shape[0] + assert ex_rois.shape[1] == 4 + assert gt_rois.shape[1] == 4 + + targets = bbox_transform(ex_rois, gt_rois) + if False: #cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) + / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) + return np.hstack( + (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) + +def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, num_classes): + """Generate a random sample of RoIs comprising foreground and background + examples. 
+ """ + # overlaps: (rois x gt_boxes) + overlaps = bbox_overlaps( + np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), + np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) + gt_assignment = overlaps.argmax(axis=1) + max_overlaps = overlaps.max(axis=1) + #labels = gt_boxes[gt_assignment, 4] + labels = gt_labels[gt_assignment] + + # Select foreground RoIs as those with >= FG_THRESH overlap + fg_inds = np.where(max_overlaps >= self.fg_threshold)[0] + # Guard against the case when an image has fewer than fg_rois_per_image + # foreground RoIs + fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) + # Sample foreground regions without replacement + if fg_inds.size > 0: + fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = np.where((max_overlaps < self.bg_threshold[1]) & + (max_overlaps >= self.bg_threshold[0]))[0] + # Compute number of background RoIs to take from this image (guarding + # against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) + # Sample background regions without replacement + if bg_inds.size > 0: + bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) + + # The indices that we're selecting (both fg and bg) + keep_inds = np.append(fg_inds, bg_inds) + # Select sampled values from various arrays: + labels = labels[keep_inds] + # Clamp labels for the background RoIs to 0 + labels[fg_rois_per_this_image:] = 0 + rois = all_rois[keep_inds] + + bbox_target_data = _compute_targets( + rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) + + bbox_targets = \ + _get_bbox_regression_labels(bbox_target_data, num_classes) + + return labels, rois, bbox_targets diff --git a/fast_rcnn/generate_anchors.py b/fast_rcnn/generate_anchors.py new file mode 100644 index 0000000000..1125a801fe --- /dev/null +++ b/fast_rcnn/generate_anchors.py @@ -0,0 +1,105 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import numpy as np + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +#array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + +def generate_anchors(base_size=16, ratios=[0.5, 1, 2], + scales=2**np.arange(3, 6)): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. + """ + + base_anchor = np.array([1, 1, base_size, base_size]) - 1 + ratio_anchors = _ratio_enum(base_anchor, ratios) + anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) + for i in xrange(ratio_anchors.shape[0])]) + return anchors + +def _whctrs(anchor): + """ + Return width, height, x center, and y center for an anchor (window). 
+ """ + + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """ + Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack((x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1))) + return anchors + +def _ratio_enum(anchor, ratios): + """ + Enumerate a set of anchors for each aspect ratio wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +def _scale_enum(anchor, scales): + """ + Enumerate a set of anchors for each scale wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +if __name__ == '__main__': + import time + t = time.time() + a = generate_anchors() + print time.time() - t + print a + from IPython import embed; embed() diff --git a/fast_rcnn/py_cpu_nms.py b/fast_rcnn/py_cpu_nms.py new file mode 100644 index 0000000000..54e7b25fef --- /dev/null +++ b/fast_rcnn/py_cpu_nms.py @@ -0,0 +1,38 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py new file mode 100644 index 0000000000..6e1f61d15d --- /dev/null +++ b/fast_rcnn/rpn.py @@ -0,0 +1,309 @@ +#import torch +import torch.nn as nn +from torch.autograd import Variable +import numpy as np +import numpy.random as npr + +# clean up environment +from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps +from generate_anchors import generate_anchors + +from py_cpu_nms import py_cpu_nms as nms + +class RPN(nn.Container): + + def __init__(self): + super(RPN, self).__init__() + + anchor_scales = (8, 16, 32) + self._anchors = generate_anchors(scales=np.array(anchor_scales)) + self._num_anchors = self._anchors.shape[0] + + self.negative_overlap = 0.3 + self.positive_overlap = 0.7 + self.fg_fraction = 0.5 + self.batch_size = 256 + + # used for both train and test + self.nms_thresh = 0.7 + self.pre_nms_topN = 12000 + self.post_nms_topN = 2000 + self.min_size = 16 + + + # output rpn probs as well + def forward(self, im, feats, gt=None): + # improve + # it is used in get_anchors and also present in roi_pooling + 
self._feat_stride = round(im.size(3)/feats.size(3)) + # rpn + # put in a separate function + rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) + all_anchors = self.rpn_get_anchors(feats) + rpn_loss = None + if self.training is True: + assert gt is not None + rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) + # need to subsample boxes here + rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) + + # roi proposal + # clip, sort, pre nms topk, nms, after nms topk + # params are different for train and test + # proposal_layer.py + roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform, im) + # only for visualization + #roi_boxes = all_anchors + + #return roi_boxes, scores, rpn_loss + return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss + #return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss, \ + # Variable(torch.from_numpy(rpn_labels)) + + def _rpn_classifier(self, x): + x = Variable(x, requires_grad=True) + m1 = nn.Conv2d(3, 18, 3, 1, 1) + m2 = nn.Conv2d(3, 36, 3, 1, 1) + return m1(x), m2(x) + #pass + + # from faster rcnn py + def rpn_get_anchors(self, im): + height, width = im.size()[-2:] + # 1. Generate proposals from bbox deltas and shifted anchors + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + all_anchors = (self._anchors.reshape((1, A, 4)) + + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + return all_anchors + + # restructure because we don't want -1 in labels + # shouldn't we instead keep only the bboxes for which labels >= 0? 
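+    # note: the -1 ("don't care") labels are kept so that the label tensor stays
+    # aligned with the full H x W x A anchor grid; they are filtered out later in
+    # rpn_loss via labels.ge(0)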
+ def rpn_targets(self, all_anchors, im, gt): + total_anchors = all_anchors.shape[0] + gt_boxes = gt['boxes'] + + height, width = im.size()[-2:] + # only keep anchors inside the image + _allowed_border = 0 + inds_inside = np.where( + (all_anchors[:, 0] >= -_allowed_border) & + (all_anchors[:, 1] >= -_allowed_border) & + (all_anchors[:, 2] < width + _allowed_border) & # width + (all_anchors[:, 3] < height + _allowed_border) # height + )[0] + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + + # overlaps between the anchors and the gt boxes + # overlaps (ex, gt) + #overlaps = bbox_overlaps(anchors, gt_boxes)#.numpy() + overlaps = bbox_overlaps(torch.from_numpy(anchors), gt_boxes).numpy() + gt_boxes = gt_boxes.numpy() + argmax_overlaps = overlaps.argmax(axis=1) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < self.negative_overlap] = 0 + + # fg label: for each gt, anchor with highest overlap + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IOU + labels[max_overlaps >= self.positive_overlap] = 1 + + # subsample positive labels if we have too many + num_fg = int(self.fg_fraction * self.batch_size) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = self.batch_size - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice( + bg_inds, size=(len(bg_inds) - num_bg), replace=False) + labels[disable_inds] = -1 + + #bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) + #bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) + bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) + + # map up to original set of anchors + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + + return labels, bbox_targets + + # I need to know the original image size (or have the scaling factor) + def get_roi_boxes(self, anchors, rpn_map, rpn_bbox_deltas, im): + # TODO fix this!!! + im_info = (100, 100, 1) + + bbox_deltas = rpn_bbox_deltas.data.numpy() + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) + + # the first set of _num_anchors channels are bg probs + # the second set are the fg probs, which we want + #scores = bottom[0].data[:, self._num_anchors:, :, :] + scores = rpn_map.data[:, self._num_anchors:, :, :].numpy() + scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) + + # Convert anchors into proposals via bbox transformations + proposals = bbox_transform_inv(anchors, bbox_deltas) + + # 2. clip predicted boxes to image + proposals = clip_boxes(proposals, im.size()[-2:]) + + # 3. remove predicted boxes with either height or width < threshold + # (NOTE: convert min_size to input image scale stored in im_info[2]) + keep = filter_boxes(proposals, self.min_size * im_info[2]) + proposals = proposals[keep, :] + scores = scores[keep] + + # 4. 
sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 6000) + order = scores.ravel().argsort()[::-1] + if self.pre_nms_topN > 0: + order = order[:self.pre_nms_topN] + proposals = proposals[order, :] + scores = scores[order] + + # 6. apply nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. return the top proposals (-> RoIs top) + keep = nms(np.hstack((proposals, scores)), self.nms_thresh) + if self.post_nms_topN > 0: + keep = keep[:self.post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep] + + return proposals, scores + + def rpn_loss(self, rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets): + height, width = rpn_map.size()[-2:] + + rpn_map = rpn_map.view(-1, 2, height, width).permute(0,2,3,1).contiguous().view(-1, 2) + labels = torch.from_numpy(rpn_labels).long() # convert properly + labels = labels.view(1, height, width, -1).permute(0, 3, 1, 2).contiguous() + labels = labels.view(-1) + + idx = labels.ge(0).nonzero()[:,0] + rpn_map = rpn_map.index_select(0, Variable(idx, requires_grad=False)) + labels = labels.index_select(0, idx) + labels = Variable(labels, requires_grad=False) + + rpn_bbox_targets = torch.from_numpy(rpn_bbox_targets) + rpn_bbox_targets = rpn_bbox_targets.view(1, height, width, -1).permute(0, 3, 1, 2) + rpn_bbox_targets = Variable(rpn_bbox_targets, requires_grad=False) + + cls_crit = nn.CrossEntropyLoss() + reg_crit = nn.SmoothL1Loss() + cls_loss = cls_crit(rpn_map, labels) + # verify normalization and sigma + reg_loss = reg_crit(rpn_bbox_transform, rpn_bbox_targets) + + loss = cls_loss + reg_loss + return loss + +def _unmap(data, count, inds, fill=0): + """ Unmap a subset of item (data) back to the original set of items (of + size count) """ + if len(data.shape) == 1: + ret = np.empty((count, ), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + +def show(img, boxes, label): + from PIL import Image, ImageDraw + #img, target = self.__getitem__(index) + draw = ImageDraw.Draw(img) + for obj, t in zip(boxes, label): + #print(type(t)) + if t == 1: + #print(t) + draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) + #draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) + #else: + elif t == 0: + pass + #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + img.show() + + + +if __name__ == '__main__': + import torch + from voc import VOCDetection, TransformVOCDetectionAnnotation + rpn = RPN() + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + class_to_ind = dict(zip(cls, range(len(cls)))) + + + train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + + im, gt = train[11] + im0 = im + + if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() + im = im.float().div_(255).unsqueeze(0) + + + + feats = torch.rand(1,3,im.size(2)/16, im.size(3)/16) + print(feats.size()) + print(im.size()) + + #rpn.eval() + rpn.train() + import time + t = time.time() + #boxes, scores, loss, 
labels = rpn(im, feats, gt) + boxes, scores, loss = rpn(im, feats, gt) + print time.time() - t + print loss + loss.backward() + + #show(im0, boxes.data, labels.data.int().tolist()) + + #from IPython import embed; embed() From 3bee8e6fa82864201cba12068b1abe7cad9a0b8e Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 11:51:08 -0200 Subject: [PATCH 07/18] frcnn runs --- fast_rcnn/bbox_transform.py | 6 +++ fast_rcnn/faster_rcnn.py | 73 +++++++++++++++++++++++++++++-------- fast_rcnn/main2.py | 46 +++++++++++++++++++++++ fast_rcnn/rpn.py | 4 +- 4 files changed, 112 insertions(+), 17 deletions(-) create mode 100644 fast_rcnn/main2.py diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index 2775d1a3a4..c134cea059 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -86,6 +86,12 @@ def filter_boxes(boxes, min_size): # torch tensors def bbox_overlaps(a, bb): + if isinstance(a, np.ndarray): + a = torch.from_numpy(a) + + if isinstance(bb, np.ndarray): + bb = torch.from_numpy(bb) + oo = [] for b in bb: diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 33e9ad1918..30d21cc184 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -1,6 +1,15 @@ +import torch import torch.nn as nn +from torch.autograd import Variable import numpy as np +import numpy.random as npr +from rpn import RPN +from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps + +m1 = nn.Conv2d(3, 3, 3, 16, 1) +m2 = nn.Linear(3*3*3, 21) +m3 = nn.Linear(3*3*3, 21*4) # should handle multiple scales, how? class FasterRCNN(nn.Container): @@ -10,33 +19,35 @@ def __init__(self): self.fg_fraction = 0.25 self.fg_threshold = 0.5 self.bg_threshold = (0, 0.5) + self._num_classes = 21 + self.rpn = RPN() # should it support batched images ? def forward(self, x): if self.training is True: im, gt = x # call model.train() here ? 
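
Aside, not part of the patch: the overlap that `bbox_overlaps` computes one ground-truth box at a time can also be written as a fully vectorised NumPy routine. A sketch, assuming the same inclusive corner convention (`x2 - x1 + 1` widths) used throughout this code:

```python
import numpy as np

def iou_matrix(a, b):
    # a: (N, 4), b: (M, 4) boxes as (x1, y1, x2, y2); returns an (N, M) IoU matrix
    area_a = (a[:, 2] - a[:, 0] + 1) * (a[:, 3] - a[:, 1] + 1)
    area_b = (b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1)
    x1 = np.maximum(a[:, None, 0], b[None, :, 0])
    y1 = np.maximum(a[:, None, 1], b[None, :, 1])
    x2 = np.minimum(a[:, None, 2], b[None, :, 2])
    y2 = np.minimum(a[:, None, 3], b[None, :, 3])
    w = np.clip(x2 - x1 + 1, 0, None)   # clipped to 0 where boxes do not overlap
    h = np.clip(y2 - y1 + 1, 0, None)
    inter = w * h
    return inter / (area_a[:, None] + area_b[None, :] - inter)
```

For well-formed boxes this should agree with the torch loop above; non-overlapping pairs come out as 0 because the clipped width or height is 0.
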
- else + else: im = x - feats = self._features(im) + feats = self._features(_tovar(im)) roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) if self.training is True: # append gt boxes and sample fg / bg boxes # proposal_target-layer.py - roi_boxes, frcnn_labels, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) + all_rois, frcnn_labels, roi_boxes, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) # r-cnn regions = self._roi_pooling(feats, roi_boxes) scores, bbox_transform = self._classifier(regions) - boxes = self.bbox_reg(roi_boxes, bbox_transform) + boxes = self.bbox_reg(roi_boxes, bbox_transform, im) # apply cls + bbox reg loss here if self.training is True: - frcnn_loss = self.frcnn_loss(scores, boxes, frcnn_labels, frcnn_bbox_targets) + frcnn_loss = self.frcnn_loss(scores, bbox_transform, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes @@ -44,20 +55,33 @@ def forward(self, x): # the user define their model in here def _features(self, x): - pass + return m1(x) def _classifier(self, x): - pass - def _roi_pooling(self, x): - pass + return m2(x), m3(x) + def _roi_pooling(self, x, rois): + from roi_pooling import roi_pooling + x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) + + def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): + cls_crit = nn.CrossEntropyLoss() + cls_loss = cls_crit(scores, labels) + + reg_crit = nn.SmoothL1Loss() + reg_loss = reg_crit(bbox_transform, bbox_targets) + + loss = cls_loss + reg_loss + return loss def frcnn_targets(self, all_rois, im, gt): - gt_boxes = gt['boxes'] - gt_labels = gt['gt_classes'] + all_rois = all_rois.data.numpy() + gt_boxes = gt['boxes'].numpy() + gt_labels = np.array(gt['gt_classes']) #zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) #all_rois = np.vstack( # (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) #) - all_rois = np.vstack(all_rois, gt_boxes) + all_rois = np.vstack((all_rois, gt_boxes)) zeros = np.zeros((all_rois.shape[0], 1), dtype=all_rois.dtype) all_rois = np.hstack((zeros, all_rois)) @@ -67,11 +91,18 @@ def frcnn_targets(self, all_rois, im, gt): # Sample rois with classification labels and bounding box regression # targets - labels, rois, bbox_targets = _sample_rois( + labels, rois, bbox_targets = _sample_rois(self, all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, self._num_classes) - return all_rois, labels, rois, bbox_targets + return _tovar((all_rois, labels, rois, bbox_targets)) + + def bbox_reg(self, boxes, box_deltas, im): + boxes = boxes.data[:,1:].numpy() + box_deltas = box_deltas.data.numpy() + pred_boxes = bbox_transform_inv(boxes, box_deltas) + pred_boxes = clip_boxes(pred_boxes, im.size()[-2:]) + return _tovar(pred_boxes) def _get_bbox_regression_labels(bbox_target_data, num_classes): """Bounding-box regression targets (bbox_target_data) are stored in a @@ -110,7 +141,7 @@ def _compute_targets(ex_rois, gt_rois, labels): return np.hstack( (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) -def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, num_classes): +def _sample_rois(self, all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, num_classes): """Generate a random sample of RoIs comprising foreground and background examples. 
""" @@ -118,6 +149,7 @@ def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_imag overlaps = bbox_overlaps( np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) + overlaps = overlaps.numpy() gt_assignment = overlaps.argmax(axis=1) max_overlaps = overlaps.max(axis=1) #labels = gt_boxes[gt_assignment, 4] @@ -158,3 +190,14 @@ def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_imag _get_bbox_regression_labels(bbox_target_data, num_classes) return labels, rois, bbox_targets + +def _tovar(x): + if isinstance(x, np.ndarray): + return Variable(torch.from_numpy(x), requires_grad=False) + elif torch.is_tensor(x): + return Variable(x, requires_grad=True) + elif isinstance(x, tuple): + t = [] + for i in x: + t.append(_tovar(i)) + return t diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py new file mode 100644 index 0000000000..3ea273531a --- /dev/null +++ b/fast_rcnn/main2.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import torch.utils.trainer as trainer +import torch.utils.data +import numpy as np + +from roi_pooling import roi_pooling +from voc import VOCDetection, TransformVOCDetectionAnnotation +from faster_rcnn import FasterRCNN + +cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') +class_to_ind = dict(zip(cls, range(len(cls)))) + + +train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + + +#train_loader = torch.utils.data.DataLoader( +# ds, batch_size=1, shuffle=True, num_workers=0) + +frcnn = FasterRCNN() + +frcnn.train() +#for i, (im, gt) in (enumerate(train_loader)): + +im, gt = train[0] +if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() + im = im.float().div_(255) + im = im.unsqueeze(0) + + +loss, scores, boxes = frcnn((im, gt)) +from IPython import embed; embed() diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 6e1f61d15d..94dafc5cd0 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -1,4 +1,4 @@ -#import torch +import torch import torch.nn as nn from torch.autograd import Variable import numpy as np @@ -61,7 +61,7 @@ def forward(self, im, feats, gt=None): # Variable(torch.from_numpy(rpn_labels)) def _rpn_classifier(self, x): - x = Variable(x, requires_grad=True) + #x = Variable(x, requires_grad=True) m1 = nn.Conv2d(3, 18, 3, 1, 1) m2 = nn.Conv2d(3, 36, 3, 1, 1) return m1(x), m2(x) From 22e769688662001e8addb077cf64bbbb687c6f00 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 19:06:33 -0200 Subject: [PATCH 08/18] updating --- fast_rcnn/faster_rcnn.py | 6 +++--- fast_rcnn/main2.py | 35 ++++++++++++++++---------------- fast_rcnn/roi_pooling.py | 44 ++++++++++++++++++++++++++++++++++++++-- fast_rcnn/rpn.py | 1 + 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 30d21cc184..440cd5bd75 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -8,8 +8,8 @@ from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, 
filter_boxes, bbox_overlaps m1 = nn.Conv2d(3, 3, 3, 16, 1) -m2 = nn.Linear(3*3*3, 21) -m3 = nn.Linear(3*3*3, 21*4) +m2 = nn.Linear(3*7*7, 21) +m3 = nn.Linear(3*7*7, 21*4) # should handle multiple scales, how? class FasterRCNN(nn.Container): @@ -60,7 +60,7 @@ def _classifier(self, x): return m2(x), m3(x) def _roi_pooling(self, x, rois): from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) + x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) return x.view(x.size(0), -1) def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py index 3ea273531a..a448e7dee6 100644 --- a/fast_rcnn/main2.py +++ b/fast_rcnn/main2.py @@ -4,10 +4,12 @@ import torch.utils.trainer as trainer import torch.utils.data import numpy as np +import torchvision.transforms as transforms from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation from faster_rcnn import FasterRCNN +from tqdm import tqdm cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', @@ -19,28 +21,25 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) +def collate_fn(batch): + imgs, gt = zip(*batch) + return imgs[0].unsqueeze(0), gt[0] -#train_loader = torch.utils.data.DataLoader( -# ds, batch_size=1, shuffle=True, num_workers=0) +train_loader = torch.utils.data.DataLoader( + train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) frcnn = FasterRCNN() frcnn.train() -#for i, (im, gt) in (enumerate(train_loader)): - -im, gt = train[0] -if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255) - im = im.unsqueeze(0) - - -loss, scores, boxes = frcnn((im, gt)) -from IPython import embed; embed() +for i, (im, gt) in tqdm(enumerate(train_loader)): + loss, scores, boxes = frcnn((im, gt)) + loss.backward() + +#im, gt = train[0] +#im = im.unsqueeze(0) + +#loss, scores, boxes = frcnn((im, gt)) +#from IPython import embed; embed() diff --git a/fast_rcnn/roi_pooling.py b/fast_rcnn/roi_pooling.py index f56cbd6fd4..5b808a653f 100644 --- a/fast_rcnn/roi_pooling.py +++ b/fast_rcnn/roi_pooling.py @@ -3,9 +3,42 @@ import torch.autograd as ag import math +#import torch.nn.functions as F +from torch.autograd.function import Function +from torch._thnn import type2backend + + +class AdaptiveMaxPool2d(Function): + def __init__(self, out_w, out_h): + super(AdaptiveMaxPool2d, self).__init__() + self.out_w = out_w + self.out_h = out_h + + def forward(self, input): + output = input.new() + indices = input.new().long() + self.save_for_backward(input) + self.indices = indices + self._backend = type2backend[type(input)] + self._backend.SpatialAdaptiveMaxPooling_updateOutput( + self._backend.library_state, input, output, indices, + self.out_w, self.out_h) + return output + + def backward(self, grad_output): + input, = self.saved_tensors + indices = self.indices + grad_input = grad_output.new() + self._backend.SpatialAdaptiveMaxPooling_updateGradInput( + self._backend.library_state, input, grad_output, grad_input, + indices) + return grad_input, None + + + # approximation for the adaptive max pooling which is currently missing 
from nn # doesn't work if the input is smaller than size -def adaptive_max_pool(input, size): +def adaptive_max_pool_old(input, size): s = input.size()[2:] assert(s[0]>= size[0] and s[1] >= size[1]) ratio = [float(x)/y for x,y in zip(s, size)] @@ -16,6 +49,10 @@ def adaptive_max_pool(input, size): return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=True)(input) #return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=False)(input) +def adaptive_max_pool(input, size): + #return F.thnn.AdaptiveMaxPool2d(size[0],size[1])(input) + return AdaptiveMaxPool2d(size[0],size[1])(input) + def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): assert(rois.dim() == 2) assert(rois.size(1) == 5) @@ -28,7 +65,7 @@ def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): for i in range(num_rois): roi = rois[i] im_idx = roi[0] - im = input.narrow(0, im_idx, 1)[..., roi[2]:roi[4], roi[1]:roi[3]] + im = input.narrow(0, im_idx, 1)[..., roi[2]:(roi[4]+1), roi[1]:(roi[3]+1)] output.append(adaptive_max_pool(im, size)) return torch.cat(output, 0) @@ -38,6 +75,9 @@ def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): rois = ag.Variable(torch.LongTensor([[0,1,2,7,8],[0,3,3,8,8]]),requires_grad=False) #rois = ag.Variable(torch.LongTensor([[0,3,3,8,8]]),requires_grad=False) + out = adaptive_max_pool(input,(3,3)) + out.backward(out.data.clone().uniform_()) + out = roi_pooling(input, rois, size=(3,3)) out.backward(out.data.clone().uniform_()) diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 94dafc5cd0..d0e2cdfb26 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -105,6 +105,7 @@ def rpn_targets(self, all_anchors, im, gt): # keep only inside anchors anchors = all_anchors[inds_inside, :] + assert anchors.shape[0] > 0 # label: 1 is positive, 0 is negative, -1 is dont care labels = np.empty((len(inds_inside), ), dtype=np.float32) From 5e71e6cc304e79fbfab9875163bf7b67df8de3e0 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 19:37:35 -0200 Subject: [PATCH 09/18] A bit of organization --- fast_rcnn/bbox_transform.py | 1 - fast_rcnn/faster_rcnn.py | 29 ++++++++------------- fast_rcnn/main2.py | 51 ++++++++++++++++++++++++++++++++++--- fast_rcnn/roi_pooling.py | 18 ------------- fast_rcnn/rpn.py | 41 +++++++++++++++-------------- 5 files changed, 77 insertions(+), 63 deletions(-) diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index c134cea059..e4c60ac223 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -88,7 +88,6 @@ def filter_boxes(boxes, min_size): def bbox_overlaps(a, bb): if isinstance(a, np.ndarray): a = torch.from_numpy(a) - if isinstance(bb, np.ndarray): bb = torch.from_numpy(bb) diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 440cd5bd75..019ca8abe0 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -3,24 +3,25 @@ from torch.autograd import Variable import numpy as np import numpy.random as npr -from rpn import RPN from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps -m1 = nn.Conv2d(3, 3, 3, 16, 1) -m2 = nn.Linear(3*7*7, 21) -m3 = nn.Linear(3*7*7, 21*4) # should handle multiple scales, how? 
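
Aside, not part of the patch: the same RoI pooling idea expressed with the functional adaptive max pool that later torch versions expose directly. A minimal sketch; the `(batch_idx, x1, y1, x2, y2)` RoI layout and the rounding of scaled coordinates are assumptions on my side:

```python
import torch
import torch.nn.functional as F

def roi_pool_sketch(feats, rois, size=(7, 7), spatial_scale=1.0 / 16.0):
    # feats: (B, C, H, W) feature map
    # rois:  (R, 5) rows of (batch_idx, x1, y1, x2, y2) in input-image coordinates
    outputs = []
    for roi in rois:
        idx = int(roi[0])
        x1, y1, x2, y2 = [int(round(float(v) * spatial_scale)) for v in roi[1:]]
        region = feats[idx:idx + 1, :, y1:y2 + 1, x1:x2 + 1]
        outputs.append(F.adaptive_max_pool2d(region, size))
    return torch.cat(outputs, 0)
```

The `(R, C, 7, 7)` result is then flattened with `.view(x.size(0), -1)` before the linear heads, as `_roi_pooling` does.
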
class FasterRCNN(nn.Container): - def __init__(self): + def __init__(self, features, pooler, classifier, rpn): super(FasterRCNN, self).__init__() + self.features = features + self.roi_pooling = pooler + self.classifier = classifier + self.rpn = rpn + self.batch_size = 128 self.fg_fraction = 0.25 self.fg_threshold = 0.5 self.bg_threshold = (0, 0.5) self._num_classes = 21 - self.rpn = RPN() + # should it support batched images ? def forward(self, x): @@ -30,7 +31,7 @@ def forward(self, x): else: im = x - feats = self._features(_tovar(im)) + feats = self.features(_tovar(im)) roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) @@ -40,8 +41,8 @@ def forward(self, x): all_rois, frcnn_labels, roi_boxes, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) # r-cnn - regions = self._roi_pooling(feats, roi_boxes) - scores, bbox_transform = self._classifier(regions) + regions = self.roi_pooling(feats, roi_boxes) + scores, bbox_transform = self.classifier(regions) boxes = self.bbox_reg(roi_boxes, bbox_transform, im) @@ -53,16 +54,6 @@ def forward(self, x): return scores, boxes - # the user define their model in here - def _features(self, x): - return m1(x) - def _classifier(self, x): - return m2(x), m3(x) - def _roi_pooling(self, x, rois): - from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) - return x.view(x.size(0), -1) - def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): cls_crit = nn.CrossEntropyLoss() cls_loss = cls_crit(scores, labels) diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py index a448e7dee6..5c791807f2 100644 --- a/fast_rcnn/main2.py +++ b/fast_rcnn/main2.py @@ -5,6 +5,8 @@ import torch.utils.data import numpy as np import torchvision.transforms as transforms +from rpn import RPN +import torch.optim as optim from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation @@ -31,12 +33,53 @@ def collate_fn(batch): train_loader = torch.utils.data.DataLoader( train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) -frcnn = FasterRCNN() +class Features(nn.Container): + def __init__(self): + super(Features, self).__init__() + self.m = nn.Conv2d(3, 3, 3, 16, 1) + + def forward(self, x): + return self.m(x) + +class Classifier(nn.Container): + def __init__(self): + super(Classifier, self).__init__() + self.m1 = nn.Linear(3*7*7, 21) + self.m2 = nn.Linear(3*7*7, 21*4) + + def forward(self, x): + return self.m1(x), self.m2(x) + +def pooler(x, rois): + from roi_pooling import roi_pooling + x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) + +class RPNClassifier(nn.Container): + def __init__(self, n): + super(RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + +rpn = RPN(RPNClassifier(3)) + +frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) frcnn.train() -for i, (im, gt) in tqdm(enumerate(train_loader)): - loss, scores, boxes = frcnn((im, gt)) - loss.backward() + +optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) + + +from IPython import embed; embed() + +#for i, (im, gt) in tqdm(enumerate(train_loader)): +# optimizer.zero_grad() +# loss, scores, boxes = frcnn((im, gt)) +# loss.backward() +# optimizer.step() #im, gt = train[0] #im = im.unsqueeze(0) diff --git a/fast_rcnn/roi_pooling.py b/fast_rcnn/roi_pooling.py index 5b808a653f..885e3a734a 100644 --- a/fast_rcnn/roi_pooling.py +++ 
b/fast_rcnn/roi_pooling.py @@ -3,11 +3,9 @@ import torch.autograd as ag import math -#import torch.nn.functions as F from torch.autograd.function import Function from torch._thnn import type2backend - class AdaptiveMaxPool2d(Function): def __init__(self, out_w, out_h): super(AdaptiveMaxPool2d, self).__init__() @@ -34,23 +32,7 @@ def backward(self, grad_output): indices) return grad_input, None - - -# approximation for the adaptive max pooling which is currently missing from nn -# doesn't work if the input is smaller than size -def adaptive_max_pool_old(input, size): - s = input.size()[2:] - assert(s[0]>= size[0] and s[1] >= size[1]) - ratio = [float(x)/y for x,y in zip(s, size)] - kernel_size = [int(math.ceil(x)) for x in ratio] - stride = kernel_size - remainder = [x*y-z for x, y, z in zip(kernel_size, size, s)] - padding = [int(math.floor((x+1)/2)) for x in remainder] - return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=True)(input) - #return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=False)(input) - def adaptive_max_pool(input, size): - #return F.thnn.AdaptiveMaxPool2d(size[0],size[1])(input) return AdaptiveMaxPool2d(size[0],size[1])(input) def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index d0e2cdfb26..788c02e73a 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -12,9 +12,11 @@ class RPN(nn.Container): - def __init__(self): + def __init__(self, classifier): super(RPN, self).__init__() + self.rpn_classifier = classifier + anchor_scales = (8, 16, 32) self._anchors = generate_anchors(scales=np.array(anchor_scales)) self._num_anchors = self._anchors.shape[0] @@ -38,7 +40,7 @@ def forward(self, im, feats, gt=None): self._feat_stride = round(im.size(3)/feats.size(3)) # rpn # put in a separate function - rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) + rpn_map, rpn_bbox_transform = self.rpn_classifier(feats) all_anchors = self.rpn_get_anchors(feats) rpn_loss = None if self.training is True: @@ -60,13 +62,6 @@ def forward(self, im, feats, gt=None): #return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss, \ # Variable(torch.from_numpy(rpn_labels)) - def _rpn_classifier(self, x): - #x = Variable(x, requires_grad=True) - m1 = nn.Conv2d(3, 18, 3, 1, 1) - m2 = nn.Conv2d(3, 36, 3, 1, 1) - return m1(x), m2(x) - #pass - # from faster rcnn py def rpn_get_anchors(self, im): height, width = im.size()[-2:] @@ -264,7 +259,19 @@ def show(img, boxes, label): if __name__ == '__main__': import torch from voc import VOCDetection, TransformVOCDetectionAnnotation - rpn = RPN() + import torchvision.transforms as transforms + + class RPNClassifier(nn.Container): + def __init__(self, n): + super(RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + + + rpn = RPN(RPNClassifier(3)) cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', @@ -275,23 +282,15 @@ def show(img, boxes, label): train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) im, gt = train[11] im0 = im - if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, 
this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255).unsqueeze(0) - - + im = im.unsqueeze(0) - feats = torch.rand(1,3,im.size(2)/16, im.size(3)/16) + feats = Variable(torch.rand(1,3,im.size(2)/16, im.size(3)/16)) print(feats.size()) print(im.size()) From 9e65a2f15aeeb392d2385c0d88b927ad73d728df Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 20:18:58 -0200 Subject: [PATCH 10/18] Organization --- fast_rcnn/bbox_transform.py | 14 ++++++++++- fast_rcnn/faster_rcnn.py | 46 ++++++++++++++++++------------------- fast_rcnn/main2.py | 12 +++++----- fast_rcnn/rpn.py | 40 ++++++++++++++++++-------------- 4 files changed, 65 insertions(+), 47 deletions(-) diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index e4c60ac223..a443cd1153 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -6,6 +6,7 @@ # -------------------------------------------------------- import torch +from torch.autograd import Variable import numpy as np def bbox_transform(ex_rois, gt_rois): @@ -120,4 +121,15 @@ def bbox_overlaps(a, bb): return torch.cat([o.view(-1,1) for o in oo],1) - +def to_var(x): + if isinstance(x, np.ndarray): + return Variable(torch.from_numpy(x), requires_grad=False) + elif torch.is_tensor(x): + return Variable(x, requires_grad=True) + elif isinstance(x, tuple): + t = [] + for i in x: + t.append(to_var(i)) + return t + elif isinstance(x, Variable): + return x diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 019ca8abe0..f0e07455a2 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -4,30 +4,39 @@ import numpy as np import numpy.random as npr -from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps +from bbox_transform import \ + bbox_transform, bbox_transform_inv, clip_boxes, bbox_overlaps + +from bbox_transform import to_var as _tovar # should handle multiple scales, how? class FasterRCNN(nn.Container): - def __init__(self, features, pooler, classifier, rpn): + def __init__(self, + features, pooler, + classifier, rpn, + batch_size=128, fg_fraction=0.25, + fg_threshold=0.5, bg_threshold=None, + num_classes=21): super(FasterRCNN, self).__init__() self.features = features self.roi_pooling = pooler self.classifier = classifier self.rpn = rpn - self.batch_size = 128 - self.fg_fraction = 0.25 - self.fg_threshold = 0.5 - self.bg_threshold = (0, 0.5) - self._num_classes = 21 + self.batch_size = batch_size + self.fg_fraction = fg_fraction + self.fg_threshold = fg_threshold + if bg_threshold is None: + bg_threshold = (0, 0.5) + self.bg_threshold = bg_threshold + self._num_classes = num_classes # should it support batched images ? def forward(self, x): if self.training is True: im, gt = x - # call model.train() here ? 
else: im = x @@ -42,24 +51,24 @@ def forward(self, x): # r-cnn regions = self.roi_pooling(feats, roi_boxes) - scores, bbox_transform = self.classifier(regions) + scores, bbox_pred = self.classifier(regions) - boxes = self.bbox_reg(roi_boxes, bbox_transform, im) + boxes = self.bbox_reg(roi_boxes, bbox_pred, im) # apply cls + bbox reg loss here if self.training is True: - frcnn_loss = self.frcnn_loss(scores, bbox_transform, frcnn_labels, frcnn_bbox_targets) + frcnn_loss = self.frcnn_loss(scores, bbox_pred, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes return scores, boxes - def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): + def frcnn_loss(self, scores, bbox_pred, labels, bbox_targets): cls_crit = nn.CrossEntropyLoss() cls_loss = cls_crit(scores, labels) reg_crit = nn.SmoothL1Loss() - reg_loss = reg_crit(bbox_transform, bbox_targets) + reg_loss = reg_crit(bbox_pred, bbox_targets) loss = cls_loss + reg_loss return loss @@ -182,13 +191,4 @@ def _sample_rois(self, all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_pe return labels, rois, bbox_targets -def _tovar(x): - if isinstance(x, np.ndarray): - return Variable(torch.from_numpy(x), requires_grad=False) - elif torch.is_tensor(x): - return Variable(x, requires_grad=True) - elif isinstance(x, tuple): - t = [] - for i in x: - t.append(_tovar(i)) - return t + diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py index 5c791807f2..604c07e263 100644 --- a/fast_rcnn/main2.py +++ b/fast_rcnn/main2.py @@ -73,13 +73,13 @@ def forward(self, x): optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) -from IPython import embed; embed() +#from IPython import embed; embed() -#for i, (im, gt) in tqdm(enumerate(train_loader)): -# optimizer.zero_grad() -# loss, scores, boxes = frcnn((im, gt)) -# loss.backward() -# optimizer.step() +for i, (im, gt) in tqdm(enumerate(train_loader)): + optimizer.zero_grad() + loss, scores, boxes = frcnn((im, gt)) + loss.backward() + optimizer.step() #im, gt = train[0] #im = im.unsqueeze(0) diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 788c02e73a..bbbe2f6096 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -8,29 +8,38 @@ from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps from generate_anchors import generate_anchors +from bbox_transform import to_var as _tovar + from py_cpu_nms import py_cpu_nms as nms class RPN(nn.Container): - def __init__(self, classifier): + def __init__(self, + classifier, anchor_scales=None, + negative_overlap=0.3, positive_overlap=0.7, + fg_fraction=0.5, batch_size=256, + nms_thresh=0.7, min_size=16, + pre_nms_topN=12000, post_nms_topN=2000 + ): super(RPN, self).__init__() self.rpn_classifier = classifier - anchor_scales = (8, 16, 32) + if anchor_scales is None: + anchor_scales = (8, 16, 32) self._anchors = generate_anchors(scales=np.array(anchor_scales)) self._num_anchors = self._anchors.shape[0] - self.negative_overlap = 0.3 - self.positive_overlap = 0.7 - self.fg_fraction = 0.5 - self.batch_size = 256 + self.negative_overlap = negative_overlap + self.positive_overlap = positive_overlap + self.fg_fraction = fg_fraction + self.batch_size = batch_size # used for both train and test - self.nms_thresh = 0.7 - self.pre_nms_topN = 12000 - self.post_nms_topN = 2000 - self.min_size = 16 + self.nms_thresh = nms_thresh + self.pre_nms_topN = pre_nms_topN + self.post_nms_topN = post_nms_topN + self.min_size = min_size # output rpn probs as well @@ -40,27 +49,24 @@ def 
forward(self, im, feats, gt=None): self._feat_stride = round(im.size(3)/feats.size(3)) # rpn # put in a separate function - rpn_map, rpn_bbox_transform = self.rpn_classifier(feats) + rpn_map, rpn_bbox_pred = self.rpn_classifier(feats) all_anchors = self.rpn_get_anchors(feats) rpn_loss = None if self.training is True: assert gt is not None rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) # need to subsample boxes here - rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) + rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_pred, rpn_labels, rpn_bbox_targets) # roi proposal # clip, sort, pre nms topk, nms, after nms topk # params are different for train and test # proposal_layer.py - roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform, im) + roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_pred, im) # only for visualization #roi_boxes = all_anchors - #return roi_boxes, scores, rpn_loss - return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss - #return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss, \ - # Variable(torch.from_numpy(rpn_labels)) + return _tovar((roi_boxes, scores, rpn_loss)) # from faster rcnn py def rpn_get_anchors(self, im): From e1196727132a5086f17ed868efa3ade2853b6e12 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 20:19:36 -0200 Subject: [PATCH 11/18] Rename --- fast_rcnn/main.py | 285 +++++++----------------------------------- fast_rcnn/main2.py | 88 ------------- fast_rcnn/main_old.py | 277 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 325 insertions(+), 325 deletions(-) delete mode 100644 fast_rcnn/main2.py create mode 100644 fast_rcnn/main_old.py diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 0cb7795536..604c07e263 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -4,13 +4,15 @@ import torch.utils.trainer as trainer import torch.utils.data import numpy as np +import torchvision.transforms as transforms +from rpn import RPN +import torch.optim as optim from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation - +from faster_rcnn import FasterRCNN from tqdm import tqdm - cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', @@ -21,257 +23,66 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) +def collate_fn(batch): + imgs, gt = zip(*batch) + return imgs[0].unsqueeze(0), gt[0] -# TODO -# add class information in dataset -# separate in different files -# remove hard-coding 21 from Sampler -# cache the sampled boxes ? 
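
Aside, not part of the patch: how the `nms_thresh` / `pre_nms_topN` / `post_nms_topN` knobs added to the RPN constructor above are consumed when selecting proposals. A standalone NumPy sketch, with `nms_fn` standing in for `py_cpu_nms`; the min-size filtering and box clipping that precede this step in `get_roi_boxes` are left out:

```python
import numpy as np

def select_proposals(proposals, scores, nms_fn,
                     nms_thresh=0.7, pre_nms_topN=12000, post_nms_topN=2000):
    # proposals: (N, 4); scores: (N, 1); nms_fn: (dets (N, 5), thresh) -> kept indices
    order = scores.ravel().argsort()[::-1][:pre_nms_topN]
    proposals, scores = proposals[order, :], scores[order]
    keep = nms_fn(np.hstack((proposals, scores)), nms_thresh)[:post_nms_topN]
    return proposals[keep, :], scores[keep]
```
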
- -# image flip goes to the dataset class, not BoxSampler - -def bbox_overlaps(a, bb): - #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b - oo = [] - - for b in bb: - - x1 = a.select(1,0).clone() - x1[x1.lt(b[0])] = b[0] - y1 = a.select(1,1).clone() - y1[y1.lt(b[1])] = b[1] - x2 = a.select(1,2).clone() - x2[x2.gt(b[2])] = b[2] - y2 = a.select(1,3).clone() - y2[y2.gt(b[3])] = b[3] - - w = x2-x1+1 - h = y2-y1+1 - inter = torch.mul(w,h).float() - aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() - barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) - - # intersection over union overlap - o = torch.div(inter , (aarea+barea-inter)) - # set invalid entries to 0 overlap - o[w.lt(0)] = 0 - o[h.lt(0)] = 0 - - oo += [o] - - return torch.cat([o.view(-1,1) for o in oo],1) - -class BoxGenerator(object): - def __init__(self, num_boxes=2000): - super(BoxGenerator, self).__init__() - self.num_boxes = num_boxes - - def __call__(self, im): - #h, w = im.size()[1:] - w, h = im.size - x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) - y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) - - x = x[0] - y = y[0] - - return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) - - -class BoxSampler(torch.utils.data.Dataset): - - def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), - generate_boxes=BoxGenerator(num_boxes=10000)): - super(BoxSampler, self).__init__() - self.dataset = dataset - self.fg_threshold = fg_threshold - self.bg_threshold = bg_threshold - self.generate_boxes = generate_boxes - - def _overlap_and_attribute(self, boxes, gt_roidb): - - #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) - overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) - - if gt_roidb is not None and gt_roidb['boxes'].size > 0: - gt_boxes = gt_roidb['boxes'] - gt_classes = np.array(gt_roidb['gt_classes']) - gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() - argmaxes = gt_overlaps.argmax(axis=1) - maxes = gt_overlaps.max(axis=1) - - # remove low scoring - pos = maxes >= self.fg_threshold - neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) - maxes[neg] = 0 - - I = np.where(maxes > 0)[0] - overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] - - overlaps = overlaps[pos | neg] - boxes = boxes.numpy() - boxes = boxes[pos | neg] - return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) - - def __getitem__(self, idx): - im, gt = self.dataset[idx] - boxes = self.generate_boxes(im) - boxes, labels = self._overlap_and_attribute(boxes, gt) - - if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255) - - return im, boxes, labels - - def __len__(self): - return len(self.dataset) - - -class BoxSelector(torch.utils.data.Dataset): - def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): - super(BoxSelector, self).__init__() - self.dataset = dataset - self.num_boxes = num_boxes - self.fg_fraction = fg_fraction - - def __len__(self): - return len(self.dataset) - - def __getitem__(self, idx): - im, boxes, labels = self.dataset[idx] - - boxes = boxes.numpy() - labels = labels.numpy() - - bg = np.where(labels == 0)[0] - fg = np.where(labels != 0)[0] - nfg = min(len(fg), self.num_boxes*self.fg_fraction) - nbg = min(len(bg), self.num_boxes - nfg) 
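
Aside, not part of the patch: the foreground/background subsampling rule that both the old `BoxSelector` being deleted here and `_sample_rois` rely on: cap the positives at `fg_fraction * num_boxes` and fill the remainder with negatives. A NumPy sketch (which labels count as positive is decided upstream by the overlap thresholds):

```python
import numpy as np

def subsample_fg_bg(labels, num_boxes=128, fg_fraction=0.25):
    # labels: (N,) int array with 0 = background, > 0 = object class
    fg = np.where(labels != 0)[0]
    bg = np.where(labels == 0)[0]
    n_fg = int(min(len(fg), round(num_boxes * fg_fraction)))
    n_bg = int(min(len(bg), num_boxes - n_fg))
    fg = np.random.permutation(fg)[:n_fg]
    bg = np.random.permutation(bg)[:n_bg]
    return np.concatenate([fg, bg])   # indices of the kept boxes
```
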
+train_loader = torch.utils.data.DataLoader( + train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) - bg = bg[np.random.permutation(len(bg))[:nbg]] - fg = fg[np.random.permutation(len(fg))[:nfg]] +class Features(nn.Container): + def __init__(self): + super(Features, self).__init__() + self.m = nn.Conv2d(3, 3, 3, 16, 1) - I = np.concatenate([fg, bg], axis=0) + def forward(self, x): + return self.m(x) - return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) +class Classifier(nn.Container): + def __init__(self): + super(Classifier, self).__init__() + self.m1 = nn.Linear(3*7*7, 21) + self.m2 = nn.Linear(3*7*7, 21*4) + def forward(self, x): + return self.m1(x), self.m2(x) -class ToPILImage(object): - """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W - or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C - to a PIL.Image of range [0, 255] - """ - def __call__(self, pic): - from PIL import Image, ImageOps - if isinstance(pic, np.ndarray): - # handle numpy array - img = Image.fromarray(pic) - else: - npimg = pic.mul(255).byte().numpy() - npimg = np.transpose(npimg, (1,2,0)) - img = Image.fromarray(npimg) - return img +def pooler(x, rois): + from roi_pooling import roi_pooling + x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) -def make_grid(tensor, nrow=8, padding=2): - import math - """ - Given a 4D mini-batch Tensor of shape (B x C x H x W), - or a list of images all of the same size, - makes a grid of images - """ - tensorlist = None - if isinstance(tensor, list): - tensorlist = tensor - numImages = len(tensorlist) - size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) - tensor = tensorlist[0].new(size) - for i in range(numImages): - tensor[i].copy_(tensorlist[i]) - if tensor.dim() == 2: # single image H x W - tensor = tensor.view(1, tensor.size(0), tensor.size(1)) - if tensor.dim() == 3: # single image - if tensor.size(0) == 1: - tensor = torch.cat((tensor, tensor, tensor), 0) - return tensor - if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images - tensor = torch.cat((tensor, tensor, tensor), 1) - # make the mini-batch of images into a grid - nmaps = tensor.size(0) - xmaps = min(nrow, nmaps) - ymaps = int(math.ceil(nmaps / xmaps)) - height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) - grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) - k = 0 - for y in range(ymaps): - for x in range(xmaps): - if k >= nmaps: - break - grid.narrow(1, y*height+1+padding//2,height-padding)\ - .narrow(2, x*width+1+padding//2, width-padding)\ - .copy_(tensor[k]) - k = k + 1 - return grid +class RPNClassifier(nn.Container): + def __init__(self, n): + super(RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + def forward(self, x): + return self.m1(x), self.m2(x) +rpn = RPN(RPNClassifier(3)) -ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) +frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) -def collate_fn(batch): - imgs, boxes, labels = zip(*batch) - max_size = [max(size) for size in zip(*[im.size() for im in imgs])] - new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) - for im, im2 in zip(new_imgs, imgs): - im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) - boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) - boxes = torch.from_numpy(boxes) - labels = 
torch.cat(labels, 0) - return new_imgs, boxes, labels - -train_loader = torch.utils.data.DataLoader( - ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) +frcnn.train() +optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) -def show(img, boxes, label, cls=None): - from PIL import Image, ImageDraw - #img, target = self.__getitem__(index) - if cls is None: - cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') - draw = ImageDraw.Draw(img) - for obj, t in zip(boxes, label): - if t > 0: - draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) - draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) - else: - #pass - draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) - img.show() +#from IPython import embed; embed() +for i, (im, gt) in tqdm(enumerate(train_loader)): + optimizer.zero_grad() + loss, scores, boxes = frcnn((im, gt)) + loss.backward() + optimizer.step() -for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): - #grid = make_grid(img, 2, 1) - #grid = ToPILImage()(grid) - #grid.show() - #break - pass - #print('====') - #print(i) - #print(img.size()) - #print(boxes.size()) - #print(labels.size()) +#im, gt = train[0] +#im = im.unsqueeze(0) -#im, box, label = ds[10] -#show(im,box,label) +#loss, scores, boxes = frcnn((im, gt)) +#from IPython import embed; embed() diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py deleted file mode 100644 index 604c07e263..0000000000 --- a/fast_rcnn/main2.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch -import torch.nn as nn -import torch.autograd as ag -import torch.utils.trainer as trainer -import torch.utils.data -import numpy as np -import torchvision.transforms as transforms -from rpn import RPN -import torch.optim as optim - -from roi_pooling import roi_pooling -from voc import VOCDetection, TransformVOCDetectionAnnotation -from faster_rcnn import FasterRCNN -from tqdm import tqdm - -cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') -class_to_ind = dict(zip(cls, range(len(cls)))) - - -train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', - transform=transforms.ToTensor(), - target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) - -def collate_fn(batch): - imgs, gt = zip(*batch) - return imgs[0].unsqueeze(0), gt[0] - -train_loader = torch.utils.data.DataLoader( - train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) - -class Features(nn.Container): - def __init__(self): - super(Features, self).__init__() - self.m = nn.Conv2d(3, 3, 3, 16, 1) - - def forward(self, x): - return self.m(x) - -class Classifier(nn.Container): - def __init__(self): - super(Classifier, self).__init__() - self.m1 = nn.Linear(3*7*7, 21) - self.m2 = nn.Linear(3*7*7, 21*4) - - def forward(self, x): - return self.m1(x), self.m2(x) - -def pooler(x, rois): - from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) - return x.view(x.size(0), -1) - -class RPNClassifier(nn.Container): - def __init__(self, n): - super(RPNClassifier, self).__init__() - self.m1 = nn.Conv2d(n, 18, 3, 1, 1) - self.m2 = nn.Conv2d(n, 36, 3, 1, 1) - - def forward(self, x): - return self.m1(x), 
self.m2(x) - -rpn = RPN(RPNClassifier(3)) - -frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) - -frcnn.train() - -optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) - - -#from IPython import embed; embed() - -for i, (im, gt) in tqdm(enumerate(train_loader)): - optimizer.zero_grad() - loss, scores, boxes = frcnn((im, gt)) - loss.backward() - optimizer.step() - -#im, gt = train[0] -#im = im.unsqueeze(0) - -#loss, scores, boxes = frcnn((im, gt)) -#from IPython import embed; embed() diff --git a/fast_rcnn/main_old.py b/fast_rcnn/main_old.py new file mode 100644 index 0000000000..0cb7795536 --- /dev/null +++ b/fast_rcnn/main_old.py @@ -0,0 +1,277 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import torch.utils.trainer as trainer +import torch.utils.data +import numpy as np + +from roi_pooling import roi_pooling +from voc import VOCDetection, TransformVOCDetectionAnnotation + +from tqdm import tqdm + + +cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') +class_to_ind = dict(zip(cls, range(len(cls)))) + + +train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + + +# TODO +# add class information in dataset +# separate in different files +# remove hard-coding 21 from Sampler +# cache the sampled boxes ? + +# image flip goes to the dataset class, not BoxSampler + +def bbox_overlaps(a, bb): + #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b + oo = [] + + for b in bb: + + x1 = a.select(1,0).clone() + x1[x1.lt(b[0])] = b[0] + y1 = a.select(1,1).clone() + y1[y1.lt(b[1])] = b[1] + x2 = a.select(1,2).clone() + x2[x2.gt(b[2])] = b[2] + y2 = a.select(1,3).clone() + y2[y2.gt(b[3])] = b[3] + + w = x2-x1+1 + h = y2-y1+1 + inter = torch.mul(w,h).float() + aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + + # intersection over union overlap + o = torch.div(inter , (aarea+barea-inter)) + # set invalid entries to 0 overlap + o[w.lt(0)] = 0 + o[h.lt(0)] = 0 + + oo += [o] + + return torch.cat([o.view(-1,1) for o in oo],1) + +class BoxGenerator(object): + def __init__(self, num_boxes=2000): + super(BoxGenerator, self).__init__() + self.num_boxes = num_boxes + + def __call__(self, im): + #h, w = im.size()[1:] + w, h = im.size + x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) + y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) + + x = x[0] + y = y[0] + + return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) + + +class BoxSampler(torch.utils.data.Dataset): + + def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), + generate_boxes=BoxGenerator(num_boxes=10000)): + super(BoxSampler, self).__init__() + self.dataset = dataset + self.fg_threshold = fg_threshold + self.bg_threshold = bg_threshold + self.generate_boxes = generate_boxes + + def _overlap_and_attribute(self, boxes, gt_roidb): + + #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) + overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) + + if gt_roidb is not None and gt_roidb['boxes'].size > 0: + gt_boxes = gt_roidb['boxes'] + gt_classes = np.array(gt_roidb['gt_classes']) + gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() + argmaxes = 
gt_overlaps.argmax(axis=1) + maxes = gt_overlaps.max(axis=1) + + # remove low scoring + pos = maxes >= self.fg_threshold + neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) + maxes[neg] = 0 + + I = np.where(maxes > 0)[0] + overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] + + overlaps = overlaps[pos | neg] + boxes = boxes.numpy() + boxes = boxes[pos | neg] + return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) + + def __getitem__(self, idx): + im, gt = self.dataset[idx] + boxes = self.generate_boxes(im) + boxes, labels = self._overlap_and_attribute(boxes, gt) + + if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() + im = im.float().div_(255) + + return im, boxes, labels + + def __len__(self): + return len(self.dataset) + + +class BoxSelector(torch.utils.data.Dataset): + def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): + super(BoxSelector, self).__init__() + self.dataset = dataset + self.num_boxes = num_boxes + self.fg_fraction = fg_fraction + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, idx): + im, boxes, labels = self.dataset[idx] + + boxes = boxes.numpy() + labels = labels.numpy() + + bg = np.where(labels == 0)[0] + fg = np.where(labels != 0)[0] + nfg = min(len(fg), self.num_boxes*self.fg_fraction) + nbg = min(len(bg), self.num_boxes - nfg) + + bg = bg[np.random.permutation(len(bg))[:nbg]] + fg = fg[np.random.permutation(len(fg))[:nfg]] + + I = np.concatenate([fg, bg], axis=0) + + return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) + + +class ToPILImage(object): + """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W + or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C + to a PIL.Image of range [0, 255] + """ + def __call__(self, pic): + from PIL import Image, ImageOps + if isinstance(pic, np.ndarray): + # handle numpy array + img = Image.fromarray(pic) + else: + npimg = pic.mul(255).byte().numpy() + npimg = np.transpose(npimg, (1,2,0)) + img = Image.fromarray(npimg) + return img + +def make_grid(tensor, nrow=8, padding=2): + import math + """ + Given a 4D mini-batch Tensor of shape (B x C x H x W), + or a list of images all of the same size, + makes a grid of images + """ + tensorlist = None + if isinstance(tensor, list): + tensorlist = tensor + numImages = len(tensorlist) + size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) + tensor = tensorlist[0].new(size) + for i in range(numImages): + tensor[i].copy_(tensorlist[i]) + if tensor.dim() == 2: # single image H x W + tensor = tensor.view(1, tensor.size(0), tensor.size(1)) + if tensor.dim() == 3: # single image + if tensor.size(0) == 1: + tensor = torch.cat((tensor, tensor, tensor), 0) + return tensor + if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images + tensor = torch.cat((tensor, tensor, tensor), 1) + # make the mini-batch of images into a grid + nmaps = tensor.size(0) + xmaps = min(nrow, nmaps) + ymaps = int(math.ceil(nmaps / xmaps)) + height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) + grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) + k = 0 + for y in range(ymaps): + for x in range(xmaps): + if k >= nmaps: + break + grid.narrow(1, y*height+1+padding//2,height-padding)\ + .narrow(2, x*width+1+padding//2, 
width-padding)\ + .copy_(tensor[k]) + k = k + 1 + return grid + + + +ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) + +def collate_fn(batch): + imgs, boxes, labels = zip(*batch) + max_size = [max(size) for size in zip(*[im.size() for im in imgs])] + new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) + for im, im2 in zip(new_imgs, imgs): + im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) + boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) + boxes = torch.from_numpy(boxes) + labels = torch.cat(labels, 0) + return new_imgs, boxes, labels + +train_loader = torch.utils.data.DataLoader( + ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) + + +def show(img, boxes, label, cls=None): + from PIL import Image, ImageDraw + #img, target = self.__getitem__(index) + if cls is None: + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + draw = ImageDraw.Draw(img) + for obj, t in zip(boxes, label): + if t > 0: + draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) + draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) + else: + #pass + draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + img.show() + + +for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): + #grid = make_grid(img, 2, 1) + #grid = ToPILImage()(grid) + #grid.show() + #break + pass + #print('====') + #print(i) + #print(img.size()) + #print(boxes.size()) + #print(labels.size()) + +#im, box, label = ds[10] +#show(im,box,label) From 4058094721e3374b0c94ebac8f22d4fcc90f089b Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 21:14:14 -0200 Subject: [PATCH 12/18] rename --- fast_rcnn/{model.py => model_old.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename fast_rcnn/{model.py => model_old.py} (100%) diff --git a/fast_rcnn/model.py b/fast_rcnn/model_old.py similarity index 100% rename from fast_rcnn/model.py rename to fast_rcnn/model_old.py From a0061e8acfc086fbd87221c47ab1aa72caf57668 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 21:14:45 -0200 Subject: [PATCH 13/18] Cleaning up a bit --- fast_rcnn/README.md | 5 ++ fast_rcnn/faster_rcnn.py | 3 +- fast_rcnn/main.py | 161 +++++++++++++++++++++++---------------- fast_rcnn/model.py | 50 ++++++++++++ 4 files changed, 153 insertions(+), 66 deletions(-) create mode 100644 fast_rcnn/README.md create mode 100644 fast_rcnn/model.py diff --git a/fast_rcnn/README.md b/fast_rcnn/README.md new file mode 100644 index 0000000000..4389746ab3 --- /dev/null +++ b/fast_rcnn/README.md @@ -0,0 +1,5 @@ +# Faster R-CNN code example + +```python +python main.py PATH_TO_DATASET +``` diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index f0e07455a2..86379f775a 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -21,8 +21,8 @@ def __init__(self, super(FasterRCNN, self).__init__() self.features = features self.roi_pooling = pooler - self.classifier = classifier self.rpn = rpn + self.classifier = classifier self.batch_size = batch_size self.fg_fraction = fg_fraction @@ -32,7 +32,6 @@ def __init__(self, self.bg_threshold = bg_threshold self._num_classes = num_classes - # should it support batched images ? 
def forward(self, x): if self.training is True: diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 604c07e263..2e993c1e85 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -1,28 +1,48 @@ +import argparse +import time + import torch import torch.nn as nn -import torch.autograd as ag -import torch.utils.trainer as trainer import torch.utils.data -import numpy as np import torchvision.transforms as transforms -from rpn import RPN + import torch.optim as optim -from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation -from faster_rcnn import FasterRCNN + +import importlib + +#from model import model + from tqdm import tqdm +parser = argparse.ArgumentParser(description='PyTorch Faster R-CNN Training') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--model', '-m', metavar='MODEL', default='model', + help='file containing model definition ' + '(default: model)') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate') +parser.add_argument('--momentum', default=0.01, type=float, metavar='M', + help='momentum') +parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)') +parser.add_argument('--print-freq', '-p', default=10, type=int, + metavar='N', help='print frequency (default: 10)') + cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') class_to_ind = dict(zip(cls, range(len(cls)))) +args = parser.parse_args() +model = importlib.import_module(args.model).model -train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', +train = VOCDetection(args.data, 'train', transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) @@ -31,58 +51,71 @@ def collate_fn(batch): return imgs[0].unsqueeze(0), gt[0] train_loader = torch.utils.data.DataLoader( - train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) - -class Features(nn.Container): - def __init__(self): - super(Features, self).__init__() - self.m = nn.Conv2d(3, 3, 3, 16, 1) - - def forward(self, x): - return self.m(x) - -class Classifier(nn.Container): - def __init__(self): - super(Classifier, self).__init__() - self.m1 = nn.Linear(3*7*7, 21) - self.m2 = nn.Linear(3*7*7, 21*4) - - def forward(self, x): - return self.m1(x), self.m2(x) - -def pooler(x, rois): - from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) - return x.view(x.size(0), -1) - -class RPNClassifier(nn.Container): - def __init__(self, n): - super(RPNClassifier, self).__init__() - self.m1 = nn.Conv2d(n, 18, 3, 1, 1) - self.m2 = nn.Conv2d(n, 36, 3, 1, 1) - - def forward(self, x): - return self.m1(x), self.m2(x) - -rpn = RPN(RPNClassifier(3)) - -frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) - -frcnn.train() - -optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) - + train, batch_size=1, shuffle=True, + num_workers=0, collate_fn=collate_fn) + + + +optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.momentum, + 
weight_decay=args.weight_decay) + +def train(train_loader, model, optimizer, epoch): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + + model.train() + end = time.time() + for i, (im, gt) in (enumerate(train_loader)): + # measure data loading time + data_time.update(time.time() - end) + + optimizer.zero_grad() + loss, scores, boxes = model((im, gt)) + loss.backward() + optimizer.step() + + losses.update(loss.data[0], im.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + if i % args.print_freq == 0: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + #'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + #'Prec@5 {top5.val:.3f} ({top5.avg:.3f})' + .format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, + #top1=top1, top5=top5 + )) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + +for epoch in range(0, 10): + train(train_loader, model, optimizer, epoch) #from IPython import embed; embed() -for i, (im, gt) in tqdm(enumerate(train_loader)): - optimizer.zero_grad() - loss, scores, boxes = frcnn((im, gt)) - loss.backward() - optimizer.step() - -#im, gt = train[0] -#im = im.unsqueeze(0) - -#loss, scores, boxes = frcnn((im, gt)) -#from IPython import embed; embed() +#if __name__ == '__main__': +# main() diff --git a/fast_rcnn/model.py b/fast_rcnn/model.py new file mode 100644 index 0000000000..ba2e1c9acb --- /dev/null +++ b/fast_rcnn/model.py @@ -0,0 +1,50 @@ +import torch.nn as nn +from roi_pooling import roi_pooling as _roi_pooling + +from rpn import RPN as _RPN +from faster_rcnn import FasterRCNN as _FasterRCNN + +class _Features(nn.Container): + def __init__(self): + super(_Features, self).__init__() + self.m = nn.Conv2d(3, 3, 3, 16, 1) + + def forward(self, x): + return self.m(x) + +class _Classifier(nn.Container): + def __init__(self): + super(_Classifier, self).__init__() + self.m1 = nn.Linear(3*7*7, 21) + self.m2 = nn.Linear(3*7*7, 21*4) + + def forward(self, x): + return self.m1(x), self.m2(x) + +def _pooler(x, rois): + x = _roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) + +class _RPNClassifier(nn.Container): + def __init__(self, n): + super(_RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + +_features = _Features() +_classifier = _Classifier() +_rpn_classifier = _RPNClassifier(3) + +_rpn = _RPN( + classifier=_rpn_classifier +) + +model = _FasterRCNN( + features=_features, + pooler=_pooler, + classifier=_classifier, + rpn=_rpn +) From cfb643fe612ce66c091ae56dfb965cb2c8c94221 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 21:47:55 -0200 Subject: [PATCH 14/18] Reduce default learning rate --- fast_rcnn/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 2e993c1e85..a7b1a280d8 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -22,9 +22,9 @@ parser.add_argument('--model', '-m', metavar='MODEL', 
default='model', help='file containing model definition ' '(default: model)') -parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, +parser.add_argument('--lr', '--learning-rate', default=0.01, type=float, metavar='LR', help='initial learning rate') -parser.add_argument('--momentum', default=0.01, type=float, metavar='M', +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, metavar='W', help='weight decay (default: 1e-4)') From e36a93606f7eb241c406b06292256921fa8b5be8 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 25 Dec 2016 13:01:47 -0200 Subject: [PATCH 15/18] Fixes --- fast_rcnn/README.md | 8 ++++++ fast_rcnn/faster_rcnn.py | 12 ++++++--- fast_rcnn/main.py | 57 +++++++++++++++++++++++++++++++++++++--- fast_rcnn/model.py | 17 ++++++------ fast_rcnn/rpn.py | 6 +++-- fast_rcnn/voc.py | 19 +++++--------- 6 files changed, 91 insertions(+), 28 deletions(-) diff --git a/fast_rcnn/README.md b/fast_rcnn/README.md index 4389746ab3..2d45a6711a 100644 --- a/fast_rcnn/README.md +++ b/fast_rcnn/README.md @@ -3,3 +3,11 @@ ```python python main.py PATH_TO_DATASET ``` + +## Things to add/change/consider +* where to handle the image scaling. Need to scale the annotations, and also RPN filters the minimum size wrt the original image size, and not the scaled image +* properly supporting flipping +* best way to handle different parameters in RPN/FRCNN for train/eval modes +* uniformize Variables, they should be provided by the user and not processed by me +* should image scaling be handled in FasterRCNN class? +* general code cleanup, lots of torch/numpy mixture diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 86379f775a..60d24158b1 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -34,16 +34,21 @@ def __init__(self, # should it support batched images ? 
def forward(self, x): - if self.training is True: + #if self.training is True: + if isinstance(x, tuple): im, gt = x else: im = x + gt = None + + assert im.size(0) == 1, 'only single element batches supported' feats = self.features(_tovar(im)) roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) - if self.training is True: + #if self.training is True: + if gt is not None: # append gt boxes and sample fg / bg boxes # proposal_target-layer.py all_rois, frcnn_labels, roi_boxes, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) @@ -55,7 +60,8 @@ def forward(self, x): boxes = self.bbox_reg(roi_boxes, bbox_pred, im) # apply cls + bbox reg loss here - if self.training is True: + #if self.training is True: + if gt is not None: frcnn_loss = self.frcnn_loss(scores, bbox_pred, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index a7b1a280d8..c1301e74aa 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -1,5 +1,6 @@ import argparse import time +#from copy import deepcopy import torch import torch.nn as nn @@ -40,9 +41,15 @@ class_to_ind = dict(zip(cls, range(len(cls)))) args = parser.parse_args() -model = importlib.import_module(args.model).model +model = importlib.import_module(args.model).model() +model_test = importlib.import_module(args.model).model() +model_test.load_state_dict(model.state_dict()) -train = VOCDetection(args.data, 'train', +train_data = VOCDetection(args.data, 'train', + transform=transforms.ToTensor(), + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +val_data = VOCDetection(args.data, 'val', transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) @@ -51,10 +58,13 @@ def collate_fn(batch): return imgs[0].unsqueeze(0), gt[0] train_loader = torch.utils.data.DataLoader( - train, batch_size=1, shuffle=True, + train_data, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) +val_loader = torch.utils.data.DataLoader( + val_data, batch_size=1, shuffle=False, + num_workers=0, collate_fn=collate_fn) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, @@ -68,6 +78,8 @@ def train(train_loader, model, optimizer, epoch): model.train() end = time.time() for i, (im, gt) in (enumerate(train_loader)): + adjust_learning_rate(optimizer, epoch) + # measure data loading time data_time.update(time.time() - end) @@ -93,7 +105,45 @@ def train(train_loader, model, optimizer, epoch): data_time=data_time, loss=losses, #top1=top1, top5=top5 )) + #global model_test + #assert model.state_dict() == model_test.state_dict() + +def validate(val_loader, model): + batch_time = AverageMeter() + losses = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + + for i, (im, gt) in enumerate(val_loader): + loss, scores, boxes = model((im, gt)) + losses.update(loss.data[0], im.size(0)) + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + #'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + #'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + #'Prec@5 {top5.val:.3f} ({top5.avg:.3f})' + .format( + i, len(val_loader), batch_time=batch_time, + #data_time=data_time, + loss=losses, + #top1=top1, top5=top5 + )) + +def adjust_learning_rate(optimizer, epoch): + """Sets the learning rate to the 
initial LR decayed by 10 every 30 epochs""" + lr = args.lr * (0.1 ** (epoch // 30)) + for param_group in optimizer.state_dict()['param_groups']: + param_group['lr'] = lr class AverageMeter(object): """Computes and stores the average and current value""" @@ -114,6 +164,7 @@ def update(self, val, n=1): for epoch in range(0, 10): train(train_loader, model, optimizer, epoch) + #validate(val_loader, model) #from IPython import embed; embed() diff --git a/fast_rcnn/model.py b/fast_rcnn/model.py index ba2e1c9acb..88cd1f95e0 100644 --- a/fast_rcnn/model.py +++ b/fast_rcnn/model.py @@ -34,17 +34,18 @@ def __init__(self, n): def forward(self, x): return self.m1(x), self.m2(x) -_features = _Features() -_classifier = _Classifier() -_rpn_classifier = _RPNClassifier(3) +def model(): + _features = _Features() + _classifier = _Classifier() + _rpn_classifier = _RPNClassifier(3) -_rpn = _RPN( + _rpn = _RPN( classifier=_rpn_classifier -) - -model = _FasterRCNN( + ) + _model = _FasterRCNN( features=_features, pooler=_pooler, classifier=_classifier, rpn=_rpn -) + ) + return _model diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index bbbe2f6096..c3608871ab 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -44,6 +44,7 @@ def __init__(self, # output rpn probs as well def forward(self, im, feats, gt=None): + assert im.size(0) == 1, 'only single element batches supported' # improve # it is used in get_anchors and also present in roi_pooling self._feat_stride = round(im.size(3)/feats.size(3)) @@ -52,7 +53,8 @@ def forward(self, im, feats, gt=None): rpn_map, rpn_bbox_pred = self.rpn_classifier(feats) all_anchors = self.rpn_get_anchors(feats) rpn_loss = None - if self.training is True: + #if self.training is True: + if gt is not None: assert gt is not None rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) # need to subsample boxes here @@ -106,7 +108,7 @@ def rpn_targets(self, all_anchors, im, gt): # keep only inside anchors anchors = all_anchors[inds_inside, :] - assert anchors.shape[0] > 0 + assert anchors.shape[0] > 0, '{0}x{1} -> {2}'.format(height,width,total_anchors) # label: 1 is positive, 0 is negative, -1 is dont care labels = np.empty((len(inds_inside), ), dtype=np.float32) diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index 164b6b9dd4..bb2331db4f 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -16,32 +16,27 @@ def __init__(self, class_to_ind, keep_difficult=False): self.class_to_ind = class_to_ind def __call__(self, target): - #res = [] - #res = {} boxes = [] gt_classes = [] for obj in target.iter('object'): difficult = int(obj.find('difficult').text) == 1 if not self.keep_difficult and difficult: continue - #name = obj.find('name').text - name = obj[0].text.lower().strip() + name = obj.find('name').text.lower().strip() bb = obj.find('bndbox') - #bbox = obj[4] bndbox = map(int, [bb.find('xmin').text, bb.find('ymin').text, bb.find('xmax').text, bb.find('ymax').text]) - # supposes the order is xmin, ymin, xmax, ymax - # attention with indices - #bndbox = [int(bb.text)-1 for bb in bbox] - #res += [bndbox + [name]] - #res += [bndbox + [class_to_ind[name]]] boxes += [torch.LongTensor(bndbox)] gt_classes += [self.class_to_ind[name]] - + + size = target.find('size') + im_info = map(int,(size.find('height').text, size.find('width').text, 1)) + res = { 'boxes': torch.cat([b.view(1,-1) for b in boxes], 0), - 'gt_classes':gt_classes + 'gt_classes':gt_classes, + 'im_info': im_info } return res From e73ee5331c7fb186a00ffb7a779c4a06c9b34fc7 Mon Sep 17 00:00:00 2001 From: Francisco 
Massa Date: Sun, 25 Dec 2016 13:05:13 -0200 Subject: [PATCH 16/18] Removing unnecessary files from tree --- fast_rcnn/main_old.py | 277 ----------------------------------------- fast_rcnn/model_old.py | 29 ----- 2 files changed, 306 deletions(-) delete mode 100644 fast_rcnn/main_old.py delete mode 100644 fast_rcnn/model_old.py diff --git a/fast_rcnn/main_old.py b/fast_rcnn/main_old.py deleted file mode 100644 index 0cb7795536..0000000000 --- a/fast_rcnn/main_old.py +++ /dev/null @@ -1,277 +0,0 @@ -import torch -import torch.nn as nn -import torch.autograd as ag -import torch.utils.trainer as trainer -import torch.utils.data -import numpy as np - -from roi_pooling import roi_pooling -from voc import VOCDetection, TransformVOCDetectionAnnotation - -from tqdm import tqdm - - -cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') -class_to_ind = dict(zip(cls, range(len(cls)))) - - -train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', - target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) - - -# TODO -# add class information in dataset -# separate in different files -# remove hard-coding 21 from Sampler -# cache the sampled boxes ? - -# image flip goes to the dataset class, not BoxSampler - -def bbox_overlaps(a, bb): - #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b - oo = [] - - for b in bb: - - x1 = a.select(1,0).clone() - x1[x1.lt(b[0])] = b[0] - y1 = a.select(1,1).clone() - y1[y1.lt(b[1])] = b[1] - x2 = a.select(1,2).clone() - x2[x2.gt(b[2])] = b[2] - y2 = a.select(1,3).clone() - y2[y2.gt(b[3])] = b[3] - - w = x2-x1+1 - h = y2-y1+1 - inter = torch.mul(w,h).float() - aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() - barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) - - # intersection over union overlap - o = torch.div(inter , (aarea+barea-inter)) - # set invalid entries to 0 overlap - o[w.lt(0)] = 0 - o[h.lt(0)] = 0 - - oo += [o] - - return torch.cat([o.view(-1,1) for o in oo],1) - -class BoxGenerator(object): - def __init__(self, num_boxes=2000): - super(BoxGenerator, self).__init__() - self.num_boxes = num_boxes - - def __call__(self, im): - #h, w = im.size()[1:] - w, h = im.size - x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) - y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) - - x = x[0] - y = y[0] - - return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) - - -class BoxSampler(torch.utils.data.Dataset): - - def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), - generate_boxes=BoxGenerator(num_boxes=10000)): - super(BoxSampler, self).__init__() - self.dataset = dataset - self.fg_threshold = fg_threshold - self.bg_threshold = bg_threshold - self.generate_boxes = generate_boxes - - def _overlap_and_attribute(self, boxes, gt_roidb): - - #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) - overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) - - if gt_roidb is not None and gt_roidb['boxes'].size > 0: - gt_boxes = gt_roidb['boxes'] - gt_classes = np.array(gt_roidb['gt_classes']) - gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() - argmaxes = gt_overlaps.argmax(axis=1) - maxes = gt_overlaps.max(axis=1) - - # remove low scoring - pos = maxes >= self.fg_threshold - neg = (maxes >= self.bg_threshold[0]) & (maxes < 
self.bg_threshold[1]) - maxes[neg] = 0 - - I = np.where(maxes > 0)[0] - overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] - - overlaps = overlaps[pos | neg] - boxes = boxes.numpy() - boxes = boxes[pos | neg] - return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) - - def __getitem__(self, idx): - im, gt = self.dataset[idx] - boxes = self.generate_boxes(im) - boxes, labels = self._overlap_and_attribute(boxes, gt) - - if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255) - - return im, boxes, labels - - def __len__(self): - return len(self.dataset) - - -class BoxSelector(torch.utils.data.Dataset): - def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): - super(BoxSelector, self).__init__() - self.dataset = dataset - self.num_boxes = num_boxes - self.fg_fraction = fg_fraction - - def __len__(self): - return len(self.dataset) - - def __getitem__(self, idx): - im, boxes, labels = self.dataset[idx] - - boxes = boxes.numpy() - labels = labels.numpy() - - bg = np.where(labels == 0)[0] - fg = np.where(labels != 0)[0] - nfg = min(len(fg), self.num_boxes*self.fg_fraction) - nbg = min(len(bg), self.num_boxes - nfg) - - bg = bg[np.random.permutation(len(bg))[:nbg]] - fg = fg[np.random.permutation(len(fg))[:nfg]] - - I = np.concatenate([fg, bg], axis=0) - - return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) - - -class ToPILImage(object): - """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W - or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C - to a PIL.Image of range [0, 255] - """ - def __call__(self, pic): - from PIL import Image, ImageOps - if isinstance(pic, np.ndarray): - # handle numpy array - img = Image.fromarray(pic) - else: - npimg = pic.mul(255).byte().numpy() - npimg = np.transpose(npimg, (1,2,0)) - img = Image.fromarray(npimg) - return img - -def make_grid(tensor, nrow=8, padding=2): - import math - """ - Given a 4D mini-batch Tensor of shape (B x C x H x W), - or a list of images all of the same size, - makes a grid of images - """ - tensorlist = None - if isinstance(tensor, list): - tensorlist = tensor - numImages = len(tensorlist) - size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) - tensor = tensorlist[0].new(size) - for i in range(numImages): - tensor[i].copy_(tensorlist[i]) - if tensor.dim() == 2: # single image H x W - tensor = tensor.view(1, tensor.size(0), tensor.size(1)) - if tensor.dim() == 3: # single image - if tensor.size(0) == 1: - tensor = torch.cat((tensor, tensor, tensor), 0) - return tensor - if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images - tensor = torch.cat((tensor, tensor, tensor), 1) - # make the mini-batch of images into a grid - nmaps = tensor.size(0) - xmaps = min(nrow, nmaps) - ymaps = int(math.ceil(nmaps / xmaps)) - height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) - grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) - k = 0 - for y in range(ymaps): - for x in range(xmaps): - if k >= nmaps: - break - grid.narrow(1, y*height+1+padding//2,height-padding)\ - .narrow(2, x*width+1+padding//2, width-padding)\ - .copy_(tensor[k]) - k = k + 1 - return grid - - - -ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) - -def collate_fn(batch): - imgs, boxes, 
labels = zip(*batch) - max_size = [max(size) for size in zip(*[im.size() for im in imgs])] - new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) - for im, im2 in zip(new_imgs, imgs): - im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) - boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) - boxes = torch.from_numpy(boxes) - labels = torch.cat(labels, 0) - return new_imgs, boxes, labels - -train_loader = torch.utils.data.DataLoader( - ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) - - -def show(img, boxes, label, cls=None): - from PIL import Image, ImageDraw - #img, target = self.__getitem__(index) - if cls is None: - cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') - - draw = ImageDraw.Draw(img) - for obj, t in zip(boxes, label): - if t > 0: - draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) - draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) - else: - #pass - draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) - img.show() - - -for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): - #grid = make_grid(img, 2, 1) - #grid = ToPILImage()(grid) - #grid.show() - #break - pass - #print('====') - #print(i) - #print(img.size()) - #print(boxes.size()) - #print(labels.size()) - -#im, box, label = ds[10] -#show(im,box,label) diff --git a/fast_rcnn/model_old.py b/fast_rcnn/model_old.py deleted file mode 100644 index 44d7dddf9b..0000000000 --- a/fast_rcnn/model_old.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch.nn as nn -from roi_pooling import roi_pooling - -class Network(nn.Container): - - def __init__(self, features, classifier): - super(Network, self).__init__() - self.features = features - self.classifier = classifier - - def forward(self, x): - images, rois = x - x = self.features(images) - x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) - x = self.classifier(x) - return x - -def basic_net(): - features = nn.Sequential(nn.Conv2d(3,16,3,16,1,1)) - classifier = nn.Sequential(nn.Linear(3*3*16,10)) - return Network(features, classifier) - -if __name__ == '__main__': - import torch - import torch.autograd - m = basic_net() - x = torch.autograd.Variable(torch.rand(1,3,224,224)) - b = torch.autograd.Variable(torch.LongTensor([[0,1,50,200,200],[0,50,50,200,200]])) - o = m((x,b)) From 79c2402c4182c940850ed2649e8b1ae23a066460 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 25 Dec 2016 13:08:08 -0200 Subject: [PATCH 17/18] Rename --- fast_rcnn/faster_rcnn.py | 4 ++-- fast_rcnn/rpn.py | 4 ++-- fast_rcnn/{bbox_transform.py => utils.py} | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename fast_rcnn/{bbox_transform.py => utils.py} (100%) diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 60d24158b1..fbd9434e7d 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -4,10 +4,10 @@ import numpy as np import numpy.random as npr -from bbox_transform import \ +from utils import \ bbox_transform, bbox_transform_inv, clip_boxes, bbox_overlaps -from bbox_transform import to_var as _tovar +from utils import to_var as _tovar # should handle multiple scales, how? 
class FasterRCNN(nn.Container): diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index c3608871ab..6faff8eaa6 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -5,10 +5,10 @@ import numpy.random as npr # clean up environment -from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps +from utils import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps from generate_anchors import generate_anchors -from bbox_transform import to_var as _tovar +from utils import to_var as _tovar from py_cpu_nms import py_cpu_nms as nms diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/utils.py similarity index 100% rename from fast_rcnn/bbox_transform.py rename to fast_rcnn/utils.py From d8d378c31d2766009db400ac03f41dd837a56c2a Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 25 Dec 2016 16:28:46 -0200 Subject: [PATCH 18/18] minor changes --- fast_rcnn/README.md | 5 +++-- fast_rcnn/rpn.py | 15 ++++++++++----- fast_rcnn/voc.py | 11 +++++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/fast_rcnn/README.md b/fast_rcnn/README.md index 2d45a6711a..fb1aab88ba 100644 --- a/fast_rcnn/README.md +++ b/fast_rcnn/README.md @@ -6,8 +6,9 @@ python main.py PATH_TO_DATASET ## Things to add/change/consider * where to handle the image scaling. Need to scale the annotations, and also RPN filters the minimum size wrt the original image size, and not the scaled image +* should image scaling be handled in FasterRCNN class? * properly supporting flipping * best way to handle different parameters in RPN/FRCNN for train/eval modes -* uniformize Variables, they should be provided by the user and not processed by me -* should image scaling be handled in FasterRCNN class? +* uniformize Variables, they should be provided by the user and not processed by FasterRCNN/RPN classes * general code cleanup, lots of torch/numpy mixture +* should I use a general config file? 
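
[Editorial aside, not part of the patches: the first README item above, handling image scaling so the annotations stay consistent with the resized image, could be sketched as a dataset-side joint transform along the following lines. `ScaleImageAndBoxes`, the joint `(img, target)` call signature, and the 600-pixel shorter-side target are illustrative assumptions, not names or defaults used in this repository.]

```python
# Illustrative sketch only: scale the image and its ground-truth boxes together,
# so downstream code (e.g. the RPN minimum-size filter) sees one coordinate frame.
from PIL import Image


class ScaleImageAndBoxes(object):
    """Resize a PIL image so its shorter side equals `target_size`
    and scale the ground-truth boxes by the same factor (assumed helper)."""

    def __init__(self, target_size=600):
        self.target_size = target_size

    def __call__(self, img, target):
        w, h = img.size
        scale = float(self.target_size) / min(w, h)
        new_w, new_h = int(round(w * scale)), int(round(h * scale))
        img = img.resize((new_w, new_h), Image.BILINEAR)

        target = dict(target)
        # boxes come out of the annotation transform as a LongTensor of x1,y1,x2,y2
        target['boxes'] = (target['boxes'].float() * scale).round().long()
        # keep im_info in the resized frame: (height, width, scale)
        target['im_info'] = [new_h, new_w, scale]
        return img, target
```

[Keeping `boxes` and `im_info` in the resized image's frame would let the RPN filter minimum box sizes against the image it actually receives, which is the concern raised in the first bullet above.]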
diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 6faff8eaa6..b7421a2fc1 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -66,10 +66,13 @@ def forward(self, im, feats, gt=None): # proposal_layer.py roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_pred, im) # only for visualization - #roi_boxes = all_anchors + if False: + roi_boxes = all_anchors + return _tovar((roi_boxes, scores, rpn_loss, rpn_labels)) return _tovar((roi_boxes, scores, rpn_loss)) + # from faster rcnn py def rpn_get_anchors(self, im): height, width = im.size()[-2:] @@ -248,7 +251,9 @@ def _unmap(data, count, inds, fill=0): def show(img, boxes, label): from PIL import Image, ImageDraw + import torchvision.transforms as transforms #img, target = self.__getitem__(index) + img = transforms.ToPILImage()(img) draw = ImageDraw.Draw(img) for obj, t in zip(boxes, label): #print(type(t)) @@ -258,8 +263,8 @@ def show(img, boxes, label): #draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) #else: elif t == 0: - pass - #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + #pass + draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() @@ -293,7 +298,7 @@ def forward(self, x): transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) - im, gt = train[11] + im, gt = train[100] im0 = im im = im.unsqueeze(0) @@ -312,6 +317,6 @@ def forward(self, x): print loss loss.backward() - #show(im0, boxes.data, labels.data.int().tolist()) + show(im0, boxes.data, labels.data.int().tolist()) #from IPython import embed; embed() diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index bb2331db4f..1eb0b0e0ed 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -9,6 +9,13 @@ else: import xml.etree.ElementTree as ET +def _flip_box(boxes, width): + boxes = boxes.clone() + oldx1 = boxes[:, 0].clone() + oldx2 = boxes[:, 2].clone() + boxes[:, 0] = width - oldx2 - 1 + boxes[:, 2] = width - oldx1 - 1 + return boxes class TransformVOCDetectionAnnotation(object): def __init__(self, class_to_ind, keep_difficult=False): @@ -27,14 +34,14 @@ def __call__(self, target): bndbox = map(int, [bb.find('xmin').text, bb.find('ymin').text, bb.find('xmax').text, bb.find('ymax').text]) - boxes += [torch.LongTensor(bndbox)] + boxes += [bndbox] gt_classes += [self.class_to_ind[name]] size = target.find('size') im_info = map(int,(size.find('height').text, size.find('width').text, 1)) res = { - 'boxes': torch.cat([b.view(1,-1) for b in boxes], 0), + 'boxes': torch.LongTensor(boxes), 'gt_classes':gt_classes, 'im_info': im_info }
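
[Editorial aside, not part of the patches: the final patch adds a `_flip_box` helper to voc.py but does not yet wire it into the dataset, and "properly supporting flipping" remains an open README item. A minimal sketch of how the helper might be used for a dataset-side horizontal flip follows; `RandomHorizontalFlipWithBoxes` and the joint `(img, target)` call signature are assumptions for illustration, not code from these patches.]

```python
# Illustrative sketch only: flip the image and mirror its boxes in one place,
# on the dataset side, so pixels and annotations stay consistent.
import random

from PIL import Image


def _flip_box(boxes, width):
    # same logic as the helper added to voc.py in the last patch
    boxes = boxes.clone()
    oldx1 = boxes[:, 0].clone()
    oldx2 = boxes[:, 2].clone()
    boxes[:, 0] = width - oldx2 - 1
    boxes[:, 2] = width - oldx1 - 1
    return boxes


class RandomHorizontalFlipWithBoxes(object):
    """Flip a PIL image left-right with probability 0.5 and mirror its boxes
    (assumed joint transform, applied before ToTensor)."""

    def __call__(self, img, target):
        if random.random() < 0.5:
            w, _ = img.size
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            target = dict(target)
            target['boxes'] = _flip_box(target['boxes'], w)
        return img, target
```

[Doing the flip at the dataset level, as the old TODO in main_old.py already suggests, keeps the mirrored boxes and pixels aligned before they ever reach the RPN or the box sampling code.]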