From 10caef70219c3612aa1918c90b771a1880f54015 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Mon, 19 Dec 2016 09:49:29 -0200 Subject: [PATCH 01/18] Changes from yesterday --- fast_rcnn/main.py | 162 +++++++++++++++++++++++++++++++++++++++ fast_rcnn/model.py | 29 +++++++ fast_rcnn/roi_pooling.py | 43 +++++++++++ fast_rcnn/voc.py | 142 ++++++++++++++++++++++++++++++++++ 4 files changed, 376 insertions(+) create mode 100644 fast_rcnn/main.py create mode 100644 fast_rcnn/model.py create mode 100644 fast_rcnn/roi_pooling.py create mode 100644 fast_rcnn/voc.py diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py new file mode 100644 index 0000000000..2c07a72b9e --- /dev/null +++ b/fast_rcnn/main.py @@ -0,0 +1,162 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import torch.utils.trainer as trainer +import torch.utils.data +import numpy as np + +from roi_pooling import roi_pooling +from voc import VOCDetection, TransformVOCDetectionAnnotation + + +cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') +class_to_ind = dict(zip(cls, range(len(cls)))) + + +train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +# two possibilities +# 1. have a new dataset class that samples random boxes and outputs, like the batch provider +# 2. let the dataset do it internally +# lets go for 1 + +# image flip goes to the dataset class, not BoxSampler + +def bbox_overlaps(a, bb): + #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b + oo = [] + + for b in bb: + + x1 = a.select(1,0).clone() + x1[x1.lt(b[0])] = b[0] + y1 = a.select(1,1).clone() + y1[y1.lt(b[1])] = b[1] + x2 = a.select(1,2).clone() + x2[x2.gt(b[2])] = b[2] + y2 = a.select(1,3).clone() + y2[y2.gt(b[3])] = b[3] + + w = x2-x1+1 + h = y2-y1+1 + inter = torch.mul(w,h).float() + aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + + # intersection over union overlap + o = torch.div(inter , (aarea+barea-inter)) + # set invalid entries to 0 overlap + o[w.lt(0)] = 0 + o[h.lt(0)] = 0 + + oo += [o] + + return torch.cat([o.view(-1,1) for o in oo],1) + +def _generate_boxes(self, im): + #h, w = im.size()[1:] + w, h = im.size + x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) + y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) + + x = x[0] + y = y[0] + + return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) + + +class BoxSampler(torch.utils.data.Dataset): + + def __init__(self, dataset, num_boxes=128, fg_fraction=0.25, fg_threshold=0.5, bg_threshold=(0.0,0.5), generate_boxes=_generate_boxes): + super(BoxSampler, self).__init__() + self.dataset = dataset + self.num_boxes = num_boxes + self.fg_fraction = fg_fraction + self.fg_threshold = fg_threshold + self.bg_threshold = bg_threshold + self.generate_boxes = generate_boxes + + def _overlap_and_attribute(self, boxes, gt_roidb): + + #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) + overlaps = np.zeros((boxes.size(0), 20), dtype=np.float32) + + if gt_roidb is not None and gt_roidb['boxes'].size > 0: + gt_boxes = gt_roidb['boxes'] + gt_classes = np.array(gt_roidb['gt_classes']) + #gt_overlaps = bbox_overlaps(boxes.astype(np.float),gt_boxes.astype(np.float)) + 
gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() + argmaxes = gt_overlaps.argmax(axis=1) + maxes = gt_overlaps.max(axis=1) + + # remove low scoring + pos = maxes >= self.fg_threshold + neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) + maxes[neg] = 0 + # need to take care of bg_threshold + + I = np.where(maxes > 0)[0] + #I = np.where()[0] + overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] + + overlaps = overlaps[pos | neg] + boxes = boxes.numpy() + boxes = boxes[pos | neg] + #argmaxes[maxes == 0] = 0 + #return torch.from_numpy(argmaxes) + return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) + + def __getitem__(self, idx): + #super(BoxSampler, self).__getitem__(idx) + im, gt = self.dataset[idx] + boxes = self.generate_boxes(self, im) + boxes, labels = self._overlap_and_attribute(boxes, gt) + return im, boxes, labels + + def __len__(self): + return len(self.dataset) + + +ds = BoxSampler(train, 64*32, fg_threshold=0.75) + +def collate_fn(batch): + imgs, targets = zip(*batch) + imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) + targets = torch.LongTensor([[i] + t for i, t in enumerate(targets, 0)]) + + return imgs, targets + +train_loader = torch.utils.data.DataLoader( + train, batch_size=2, shuffle=True, num_workers=1, collate_fn=collate_fn) + + +def show(img, boxes, label, cls=None): + from PIL import Image, ImageDraw + #img, target = self.__getitem__(index) + if cls is None: + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + draw = ImageDraw.Draw(img) + for obj, t in zip(boxes, label): + if t > 0: + draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) + draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) + else: + pass + #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + img.show() + + +#im, box, label = ds[10] +#show(im,box,label) diff --git a/fast_rcnn/model.py b/fast_rcnn/model.py new file mode 100644 index 0000000000..44d7dddf9b --- /dev/null +++ b/fast_rcnn/model.py @@ -0,0 +1,29 @@ +import torch.nn as nn +from roi_pooling import roi_pooling + +class Network(nn.Container): + + def __init__(self, features, classifier): + super(Network, self).__init__() + self.features = features + self.classifier = classifier + + def forward(self, x): + images, rois = x + x = self.features(images) + x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) + x = self.classifier(x) + return x + +def basic_net(): + features = nn.Sequential(nn.Conv2d(3,16,3,16,1,1)) + classifier = nn.Sequential(nn.Linear(3*3*16,10)) + return Network(features, classifier) + +if __name__ == '__main__': + import torch + import torch.autograd + m = basic_net() + x = torch.autograd.Variable(torch.rand(1,3,224,224)) + b = torch.autograd.Variable(torch.LongTensor([[0,1,50,200,200],[0,50,50,200,200]])) + o = m((x,b)) diff --git a/fast_rcnn/roi_pooling.py b/fast_rcnn/roi_pooling.py new file mode 100644 index 0000000000..f56cbd6fd4 --- /dev/null +++ b/fast_rcnn/roi_pooling.py @@ -0,0 +1,43 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import math + +# approximation for the adaptive max pooling which is currently missing from nn +# doesn't work if the input is smaller than size +def adaptive_max_pool(input, size): + s = input.size()[2:] + assert(s[0]>= size[0] and s[1] >= size[1]) + ratio = [float(x)/y for x,y in zip(s, size)] + kernel_size = 
[int(math.ceil(x)) for x in ratio] + stride = kernel_size + remainder = [x*y-z for x, y, z in zip(kernel_size, size, s)] + padding = [int(math.floor((x+1)/2)) for x in remainder] + return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=True)(input) + #return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=False)(input) + +def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): + assert(rois.dim() == 2) + assert(rois.size(1) == 5) + output = [] + rois = rois.data.float() + num_rois = rois.size(0) + + rois[:,1:].mul_(spatial_scale) + rois = rois.long() + for i in range(num_rois): + roi = rois[i] + im_idx = roi[0] + im = input.narrow(0, im_idx, 1)[..., roi[2]:roi[4], roi[1]:roi[3]] + output.append(adaptive_max_pool(im, size)) + + return torch.cat(output, 0) + +if __name__ == '__main__': + input = ag.Variable(torch.rand(1,1,10,10), requires_grad=True) + rois = ag.Variable(torch.LongTensor([[0,1,2,7,8],[0,3,3,8,8]]),requires_grad=False) + #rois = ag.Variable(torch.LongTensor([[0,3,3,8,8]]),requires_grad=False) + + out = roi_pooling(input, rois, size=(3,3)) + out.backward(out.data.clone().uniform_()) + diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py new file mode 100644 index 0000000000..0929646e10 --- /dev/null +++ b/fast_rcnn/voc.py @@ -0,0 +1,142 @@ +import torch +import torch.utils.data as data +from PIL import Image, ImageDraw +import os +import os.path +import sys +if sys.version_info[0] == 2: + import xml.etree.cElementTree as ET +else: + import xml.etree.ElementTree as ET + + +class TransformVOCDetectionAnnotation(object): + def __init__(self, class_to_ind, keep_difficult=False): + self.keep_difficult = keep_difficult + self.class_to_ind = class_to_ind + + def __call__(self, target): + #res = [] + #res = {} + boxes = [] + gt_classes = [] + for obj in target.iter('object'): + difficult = int(obj.find('difficult').text) == 1 + if not self.keep_difficult and difficult: + continue + #name = obj.find('name').text + name = obj[0].text.lower().strip() + #bb = obj.find('bndbox') + bbox = obj[4] + #bndbox = [bb.find('xmin').text, bb.find('ymin').text, + # bb.find('xmax').text, bb.find('ymax').text] + # supposes the order is xmin, ymin, xmax, ymax + # attention with indices + bndbox = [int(bb.text)-1 for bb in bbox] + + #res += [bndbox + [name]] + #res += [bndbox + [class_to_ind[name]]] + boxes += [torch.LongTensor(bndbox)] + gt_classes += [self.class_to_ind[name]] + + res = { + 'boxes': torch.cat([b.view(1,-1) for b in boxes], 0), + 'gt_classes':gt_classes + } + return res + +class VOCSegmentation(data.Dataset): + def __init__(self, root, image_set, transform=None, target_transform=None): + self.root = root + self.image_set = image_set + self.transform = transform + self.target_transform = target_transform + + dataset_name = 'VOC2007' + self._annopath = os.path.join(self.root, dataset_name, 'SegmentationClass', '%s.png') + self._imgpath = os.path.join(self.root, dataset_name, 'JPEGImages', '%s.jpg') + self._imgsetpath = os.path.join(self.root, dataset_name, 'ImageSets', 'Segmentation', '%s.txt') + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip('\n') for x in self.ids] + + def __getitem__(self, index): + img_id = self.ids[index] + + target = Image.open(self._annopath % img_id)#.convert('RGB') + + img = Image.open(self._imgpath % img_id).convert('RGB') + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target 
+ + def __len__(self): + return len(self.ids) + + +class VOCDetection(data.Dataset): + def __init__(self, root, image_set, transform=None, target_transform=None): + self.root = root + self.image_set = image_set + self.transform = transform + self.target_transform = target_transform + + dataset_name = 'VOC2007' + self._annopath = os.path.join(self.root, dataset_name, 'Annotations', '%s.xml') + self._imgpath = os.path.join(self.root, dataset_name, 'JPEGImages', '%s.jpg') + self._imgsetpath = os.path.join(self.root, dataset_name, 'ImageSets', 'Main', '%s.txt') + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip('\n') for x in self.ids] + + def __getitem__(self, index): + img_id = self.ids[index] + + target = ET.parse(self._annopath % img_id).getroot() + + img = Image.open(self._imgpath % img_id).convert('RGB') + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.ids) + + def show(self, index): + img, target = self.__getitem__(index) + draw = ImageDraw.Draw(img) + for obj in target: + draw.rectangle(obj[0:4], outline=(255,0,0)) + draw.text(obj[0:2], obj[4], fill=(0,255,0)) + img.show() + +if __name__ == '__main__': + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + class_to_ind = dict(zip(cls, range(len(cls)))) + + ds = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + print(len(ds)) + img, target = ds[0] + print(target) + #ds.show(1) + #dss = VOCSegmentation('/home/francisco/work/datasets/VOCdevkit/', 'train') + #img, target = dss[0] + + #img.show() + #print(target_transform(target)) From 55b2bb0994f44257b9a9b1372df315cf025295b2 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Mon, 19 Dec 2016 09:49:52 -0200 Subject: [PATCH 02/18] Seems to work --- fast_rcnn/main.py | 42 ++++++++++++++++++++++++++++++++++-------- fast_rcnn/voc.py | 10 +++++----- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 2c07a72b9e..f38067aba7 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -8,6 +8,8 @@ from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation +from tqdm import tqdm + cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', @@ -85,7 +87,7 @@ def __init__(self, dataset, num_boxes=128, fg_fraction=0.25, fg_threshold=0.5, b def _overlap_and_attribute(self, boxes, gt_roidb): #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) - overlaps = np.zeros((boxes.size(0), 20), dtype=np.float32) + overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) if gt_roidb is not None and gt_roidb['boxes'].size > 0: gt_boxes = gt_roidb['boxes'] @@ -117,23 +119,40 @@ def __getitem__(self, idx): im, gt = self.dataset[idx] boxes = self.generate_boxes(self, im) boxes, labels = self._overlap_and_attribute(boxes, gt) + + if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() 
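+            # note: an equivalent HWC -> CHW conversion, assuming permute is
+            # available in this torch version, would be:
+            #   im = im.permute(2, 0, 1).contiguous()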
+ im = im.float().div_(255) + return im, boxes, labels def __len__(self): return len(self.dataset) -ds = BoxSampler(train, 64*32, fg_threshold=0.75) +ds = BoxSampler(train, 64, fg_threshold=0.5) def collate_fn(batch): - imgs, targets = zip(*batch) - imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) - targets = torch.LongTensor([[i] + t for i, t in enumerate(targets, 0)]) - - return imgs, targets + imgs, boxes, labels = zip(*batch) + max_size = [max(size) for size in zip(*[im.size() for im in imgs])] + new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) + for im, im2 in zip(new_imgs, imgs): + im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) + #imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) + #boxes = torch.LongTensor([[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)]) + #boxes = [[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)] + boxes = np.concatenate([np.column_stack((np.full(t.size(0), i), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) + boxes = torch.from_numpy(boxes) + labels = torch.cat(labels, 0) + return new_imgs, boxes, labels train_loader = torch.utils.data.DataLoader( - train, batch_size=2, shuffle=True, num_workers=1, collate_fn=collate_fn) + ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) def show(img, boxes, label, cls=None): @@ -157,6 +176,13 @@ def show(img, boxes, label, cls=None): #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() +for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): + pass + #print('====') + #print(i) + #print(img.size()) + #print(boxes.size()) + #print(labels.size()) #im, box, label = ds[10] #show(im,box,label) diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index 0929646e10..757f6ceef2 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -26,13 +26,13 @@ def __call__(self, target): continue #name = obj.find('name').text name = obj[0].text.lower().strip() - #bb = obj.find('bndbox') - bbox = obj[4] - #bndbox = [bb.find('xmin').text, bb.find('ymin').text, - # bb.find('xmax').text, bb.find('ymax').text] + bb = obj.find('bndbox') + #bbox = obj[4] + bndbox = [int(bb.find('xmin').text), int(bb.find('ymin').text), + int(bb.find('xmax').text), int(bb.find('ymax').text)] # supposes the order is xmin, ymin, xmax, ymax # attention with indices - bndbox = [int(bb.text)-1 for bb in bbox] + #bndbox = [int(bb.text)-1 for bb in bbox] #res += [bndbox + [name]] #res += [bndbox + [class_to_ind[name]]] From faa3b4e719601dc18b9b7bdb19f847cfb8c8a6e8 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Mon, 19 Dec 2016 12:03:54 -0200 Subject: [PATCH 03/18] Change generator --- fast_rcnn/main.py | 78 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index f38067aba7..fd6581324d 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -23,11 +23,6 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) -# two possibilities -# 1. have a new dataset class that samples random boxes and outputs, like the batch provider -# 2. 
let the dataset do it internally -# lets go for 1 - # image flip goes to the dataset class, not BoxSampler def bbox_overlaps(a, bb): @@ -61,25 +56,29 @@ def bbox_overlaps(a, bb): return torch.cat([o.view(-1,1) for o in oo],1) -def _generate_boxes(self, im): - #h, w = im.size()[1:] - w, h = im.size - x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) - y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) +class BoxGenerator(object): + def __init__(self, num_boxes=2000): + super(BoxGenerator, self).__init__() + self.num_boxes = num_boxes + + def __call__(self, im): + #h, w = im.size()[1:] + w, h = im.size + x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) + y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) - x = x[0] - y = y[0] + x = x[0] + y = y[0] - return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) + return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) class BoxSampler(torch.utils.data.Dataset): - def __init__(self, dataset, num_boxes=128, fg_fraction=0.25, fg_threshold=0.5, bg_threshold=(0.0,0.5), generate_boxes=_generate_boxes): + def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), + generate_boxes=BoxGenerator(num_boxes=10000)): super(BoxSampler, self).__init__() self.dataset = dataset - self.num_boxes = num_boxes - self.fg_fraction = fg_fraction self.fg_threshold = fg_threshold self.bg_threshold = bg_threshold self.generate_boxes = generate_boxes @@ -117,10 +116,10 @@ def _overlap_and_attribute(self, boxes, gt_roidb): def __getitem__(self, idx): #super(BoxSampler, self).__getitem__(idx) im, gt = self.dataset[idx] - boxes = self.generate_boxes(self, im) + boxes = self.generate_boxes(im) boxes, labels = self._overlap_and_attribute(boxes, gt) - if True: + if False: w, h = im.size im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) im = im.view(h, w, 3) @@ -135,7 +134,37 @@ def __len__(self): return len(self.dataset) -ds = BoxSampler(train, 64, fg_threshold=0.5) +class BoxSelector(torch.utils.data.Dataset): + def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): + super(BoxSelector, self).__init__() + self.dataset = dataset + self.num_boxes = num_boxes + self.fg_fraction = fg_fraction + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, idx): + im, boxes, labels = self.dataset[idx] + + boxes = boxes.numpy() + labels = labels.numpy() + + bg = np.where(labels == 0)[0] + fg = np.where(labels != 0)[0] + nfg = min(len(fg), self.num_boxes*self.fg_fraction) + nbg = min(len(bg), self.num_boxes - nfg) + + bg = bg[np.random.permutation(len(bg))[:nbg]] + fg = fg[np.random.permutation(len(fg))[:nfg]] + + I = np.concatenate([fg, bg], axis=0) + + return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) + + + +ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) def collate_fn(batch): imgs, boxes, labels = zip(*batch) @@ -146,7 +175,7 @@ def collate_fn(batch): #imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) #boxes = torch.LongTensor([[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)]) #boxes = [[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)] - boxes = np.concatenate([np.column_stack((np.full(t.size(0), i), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) + boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) boxes = torch.from_numpy(boxes) labels = torch.cat(labels, 0) return new_imgs, boxes, 
labels @@ -176,13 +205,14 @@ def show(img, boxes, label, cls=None): #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() -for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): - pass + +#for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): +# pass #print('====') #print(i) #print(img.size()) #print(boxes.size()) #print(labels.size()) -#im, box, label = ds[10] -#show(im,box,label) +im, box, label = ds[10] +show(im,box,label) From f2e92489b4152da6f6d85f1bdee58e25c2f19eaf Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Thu, 22 Dec 2016 10:57:40 -0200 Subject: [PATCH 04/18] fast rcnn No bbox regression --- fast_rcnn/main.py | 91 ++++++++++++++++++++++++++++++++++++++--------- fast_rcnn/voc.py | 4 +-- 2 files changed, 77 insertions(+), 18 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index fd6581324d..0cb7795536 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -23,6 +23,13 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +# TODO +# add class information in dataset +# separate in different files +# remove hard-coding 21 from Sampler +# cache the sampled boxes ? + # image flip goes to the dataset class, not BoxSampler def bbox_overlaps(a, bb): @@ -91,7 +98,6 @@ def _overlap_and_attribute(self, boxes, gt_roidb): if gt_roidb is not None and gt_roidb['boxes'].size > 0: gt_boxes = gt_roidb['boxes'] gt_classes = np.array(gt_roidb['gt_classes']) - #gt_overlaps = bbox_overlaps(boxes.astype(np.float),gt_boxes.astype(np.float)) gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() argmaxes = gt_overlaps.argmax(axis=1) maxes = gt_overlaps.max(axis=1) @@ -100,26 +106,21 @@ def _overlap_and_attribute(self, boxes, gt_roidb): pos = maxes >= self.fg_threshold neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) maxes[neg] = 0 - # need to take care of bg_threshold I = np.where(maxes > 0)[0] - #I = np.where()[0] overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] overlaps = overlaps[pos | neg] boxes = boxes.numpy() boxes = boxes[pos | neg] - #argmaxes[maxes == 0] = 0 - #return torch.from_numpy(argmaxes) return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) def __getitem__(self, idx): - #super(BoxSampler, self).__getitem__(idx) im, gt = self.dataset[idx] boxes = self.generate_boxes(im) boxes, labels = self._overlap_and_attribute(boxes, gt) - if False: + if True: w, h = im.size im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) im = im.view(h, w, 3) @@ -163,6 +164,63 @@ def __getitem__(self, idx): return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) +class ToPILImage(object): + """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W + or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C + to a PIL.Image of range [0, 255] + """ + def __call__(self, pic): + from PIL import Image, ImageOps + if isinstance(pic, np.ndarray): + # handle numpy array + img = Image.fromarray(pic) + else: + npimg = pic.mul(255).byte().numpy() + npimg = np.transpose(npimg, (1,2,0)) + img = Image.fromarray(npimg) + return img + +def make_grid(tensor, nrow=8, padding=2): + import math + """ + Given a 4D mini-batch Tensor of shape (B x C x H x W), + or a list of images all of the same size, + makes a grid of images + """ + tensorlist = None + if isinstance(tensor, list): + tensorlist = tensor + numImages = len(tensorlist) + size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) + tensor 
= tensorlist[0].new(size) + for i in range(numImages): + tensor[i].copy_(tensorlist[i]) + if tensor.dim() == 2: # single image H x W + tensor = tensor.view(1, tensor.size(0), tensor.size(1)) + if tensor.dim() == 3: # single image + if tensor.size(0) == 1: + tensor = torch.cat((tensor, tensor, tensor), 0) + return tensor + if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images + tensor = torch.cat((tensor, tensor, tensor), 1) + # make the mini-batch of images into a grid + nmaps = tensor.size(0) + xmaps = min(nrow, nmaps) + ymaps = int(math.ceil(nmaps / xmaps)) + height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) + grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) + k = 0 + for y in range(ymaps): + for x in range(xmaps): + if k >= nmaps: + break + grid.narrow(1, y*height+1+padding//2,height-padding)\ + .narrow(2, x*width+1+padding//2, width-padding)\ + .copy_(tensor[k]) + k = k + 1 + return grid + + ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) @@ -172,9 +230,6 @@ def collate_fn(batch): new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) for im, im2 in zip(new_imgs, imgs): im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) - #imgs = torch.cat([t.view(1, *t.size()) for t in imgs], 0) - #boxes = torch.LongTensor([[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)]) - #boxes = [[[i]*t.size(0)] + t.tolist() for i, t in enumerate(boxes, 0)] boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) boxes = torch.from_numpy(boxes) labels = torch.cat(labels, 0) @@ -201,18 +256,22 @@ def show(img, boxes, label, cls=None): draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) else: - pass - #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + #pass + draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() -#for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): -# pass +for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): + #grid = make_grid(img, 2, 1) + #grid = ToPILImage()(grid) + #grid.show() + #break + pass #print('====') #print(i) #print(img.size()) #print(boxes.size()) #print(labels.size()) -im, box, label = ds[10] -show(im,box,label) +#im, box, label = ds[10] +#show(im,box,label) diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index 757f6ceef2..164b6b9dd4 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -28,8 +28,8 @@ def __call__(self, target): name = obj[0].text.lower().strip() bb = obj.find('bndbox') #bbox = obj[4] - bndbox = [int(bb.find('xmin').text), int(bb.find('ymin').text), - int(bb.find('xmax').text), int(bb.find('ymax').text)] + bndbox = map(int, [bb.find('xmin').text, bb.find('ymin').text, + bb.find('xmax').text, bb.find('ymax').text]) # supposes the order is xmin, ymin, xmax, ymax # attention with indices #bndbox = [int(bb.text)-1 for bb in bbox] From 3b3f1aeb9c106230cc2cd42d6e0a17da002b5718 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Thu, 22 Dec 2016 10:58:13 -0200 Subject: [PATCH 05/18] Starting to prototype faster rcnn --- fast_rcnn/bbox_transform.py | 83 +++++++++++++++ fast_rcnn/faster_rcnn.py | 203 ++++++++++++++++++++++++++++++++++++ 2 files changed, 286 insertions(+) create mode 100644 fast_rcnn/bbox_transform.py create mode 100644 fast_rcnn/faster_rcnn.py diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py new file mode 100644 index 0000000000..358cdc9e07 --- /dev/null +++ 
b/fast_rcnn/bbox_transform.py @@ -0,0 +1,83 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def bbox_transform(ex_rois, gt_rois): + ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 + ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 + ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths + ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights + + gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 + gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 + gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths + gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights + + targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = np.log(gt_widths / ex_widths) + targets_dh = np.log(gt_heights / ex_heights) + + targets = np.vstack( + (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() + return targets + +def bbox_transform_inv(boxes, deltas): + if boxes.shape[0] == 0: + return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) + + boxes = boxes.astype(deltas.dtype, copy=False) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + dx = deltas[:, 0::4] + dy = deltas[:, 1::4] + dw = deltas[:, 2::4] + dh = deltas[:, 3::4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w + # y2 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h + + return pred_boxes + +def clip_boxes(boxes, im_shape): + """ + Clip boxes to image boundaries. + """ + + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + +def filter_boxes(boxes, min_size): + """Remove all boxes with any side smaller than min_size.""" + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws >= min_size) & (hs >= min_size))[0] + return keep diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py new file mode 100644 index 0000000000..73a10a98eb --- /dev/null +++ b/fast_rcnn/faster_rcnn.py @@ -0,0 +1,203 @@ +import torch.nn as nn +import numpy as np + +# clean up environment +from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes + +class FasterRCNN(nn.Container): + + def __init__(self): + self.rpn_param = { + '_feat_stride':16 + } + + # need to have a train and test mode + # should it support batched images ? + # need to pass target as argument only in train mode + def forward(self, x): + if self.train is True: + im, gt = x + # call model.train() here ? 
+ else + im = x + + feats = self._features(im) + + # improve + # it is used in get_anchors and also present in roi_pooling + self._feat_stride = round(im.size(4)/feats.size(4)) + # rpn + # put in a separate function + rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) + all_anchors = self.rpn_get_anchors(im) + #rpn_boxes = self.rpn_estimate(all_anchors, rpn_map) + if self.train is True: + rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) + # need to subsample boxes here + rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) + + # roi proposal + # clip, sort, pre nms topk, nms, after nms topk + # proposal_layer.py + # roi_boxes = self.get_roi_boxes(rpn_map, rpn_boxes) + roi_boxes = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform) + + if self.train is True: + # append gt boxes and sample fg / bg boxes + # proposal_target-layer.py + roi_boxes, frcnn_labels, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) + + # r-cnn + regions = self._roi_pooling(feats, roi_boxes) + scores, bbox_transform = self._classifier(regions) + + boxes = self.bbox_reg(roi_boxes, bbox_transform) + + # apply cls + bbox reg loss here + if self.train is True: + frcnn_loss = self.frcnn_loss(scores, boxes, frcnn_labels, frcnn_bbox_targets) + loss = frcnn_loss + rpn_loss + return loss, scores, boxes + + return scores, boxes + + # the user define their model in here + def _features(self, x): + # _feat_stride should be defined / inferred from here + pass + def _classifier(self, x): + pass + def _roi_pooling(self, x): + pass + def _rpn_classifier(self, x): + pass + + # from faster rcnn py + def rpn_get_anchors(self, im): + height, width = im.size()[-2:] + # 1. Generate proposals from bbox deltas and shifted anchors + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + all_anchors = (self._anchors.reshape((1, A, 4)) + + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + return all_anchors + + # restructure because we don't want -1 in labels + # shouldn't we instead keep only the bboxes for which labels >= 0? 
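+    # note: as written, rpn_targets returns labels for *all* anchors, encoded as
+    # 1 = foreground, 0 = background, -1 = ignore (see _unmap with fill=-1 below)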
+ def rpn_targets(self, all_anchors, im, gt): + total_anchors = all_anchors.shape[0] + gt_boxes = gt['boxes'] + + height, width = im.size()[-2:] + # only keep anchors inside the image + _allowed_border = 0 + inds_inside = np.where( + (all_anchors[:, 0] >= -_allowed_border) & + (all_anchors[:, 1] >= -_allowed_border) & + (all_anchors[:, 2] < width + _allowed_border) & # width + (all_anchors[:, 3] < height + _allowed_border) # height + )[0] + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + + # overlaps between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors, gt_boxes)#.numpy() + argmax_overlaps = overlaps.argmax(axis=1) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # fg label: for each gt, anchor with highest overlap + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IOU + labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + # subsample positive labels if we have too many + num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice( + bg_inds, size=(len(bg_inds) - num_bg), replace=False) + labels[disable_inds] = -1 + + #bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) + #bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) + bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) + + # map up to original set of anchors + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + + return labels, bbox_targets + + + # I need to know the original image size (or have the scaling factor) + def get_roi_boxes(self, all_anchors, rpn_map, rpn_bbox_deltas) + + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) + + # the first set of _num_anchors channels are bg probs + # the second set are the fg probs, which we want + #scores = bottom[0].data[:, self._num_anchors:, :, :] + scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) + + # Convert anchors into proposals via bbox transformations + proposals = bbox_transform_inv(anchors, bbox_deltas) + + # 2. clip predicted boxes to image + proposals = clip_boxes(proposals, im_info[:2]) + + # 3. remove predicted boxes with either height or width < threshold + # (NOTE: convert min_size to input image scale stored in im_info[2]) + keep = filter_boxes(proposals, min_size * im_info[2]) + proposals = proposals[keep, :] + scores = scores[keep] + + # 4. sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 
6000) + order = scores.ravel().argsort()[::-1] + if pre_nms_topN > 0: + order = order[:pre_nms_topN] + proposals = proposals[order, :] + scores = scores[order] + + # 6. apply nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. return the top proposals (-> RoIs top) + keep = nms(np.hstack((proposals, scores)), nms_thresh) + if post_nms_topN > 0: + keep = keep[:post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep] + + return roi_boxes From c653216c77f210b6458b6ace2d8cbc1a75dad891 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Fri, 23 Dec 2016 13:14:36 -0200 Subject: [PATCH 06/18] rpn runs Need to test for correctness --- fast_rcnn/bbox_transform.py | 35 ++++ fast_rcnn/faster_rcnn.py | 275 +++++++++++++----------------- fast_rcnn/generate_anchors.py | 105 ++++++++++++ fast_rcnn/py_cpu_nms.py | 38 +++++ fast_rcnn/rpn.py | 309 ++++++++++++++++++++++++++++++++++ 5 files changed, 603 insertions(+), 159 deletions(-) create mode 100644 fast_rcnn/generate_anchors.py create mode 100644 fast_rcnn/py_cpu_nms.py create mode 100644 fast_rcnn/rpn.py diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index 358cdc9e07..2775d1a3a4 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -5,6 +5,7 @@ # Written by Ross Girshick # -------------------------------------------------------- +import torch import numpy as np def bbox_transform(ex_rois, gt_rois): @@ -81,3 +82,37 @@ def filter_boxes(boxes, min_size): hs = boxes[:, 3] - boxes[:, 1] + 1 keep = np.where((ws >= min_size) & (hs >= min_size))[0] return keep + + +# torch tensors +def bbox_overlaps(a, bb): + oo = [] + + for b in bb: + + x1 = a.select(1,0).clone() + x1[x1.lt(b[0])] = b[0] + y1 = a.select(1,1).clone() + y1[y1.lt(b[1])] = b[1] + x2 = a.select(1,2).clone() + x2[x2.gt(b[2])] = b[2] + y2 = a.select(1,3).clone() + y2[y2.gt(b[3])] = b[3] + + w = x2-x1+1 + h = y2-y1+1 + inter = torch.mul(w,h).float() + aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + + # intersection over union overlap + o = torch.div(inter , (aarea+barea-inter)) + # set invalid entries to 0 overlap + o[w.lt(0)] = 0 + o[h.lt(0)] = 0 + + oo += [o] + + return torch.cat([o.view(-1,1) for o in oo],1) + + diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 73a10a98eb..33e9ad1918 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -1,21 +1,19 @@ import torch.nn as nn import numpy as np -# clean up environment -from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes - +# should handle multiple scales, how? class FasterRCNN(nn.Container): def __init__(self): - self.rpn_param = { - '_feat_stride':16 - } + super(FasterRCNN, self).__init__() + self.batch_size = 128 + self.fg_fraction = 0.25 + self.fg_threshold = 0.5 + self.bg_threshold = (0, 0.5) - # need to have a train and test mode # should it support batched images ? - # need to pass target as argument only in train mode def forward(self, x): - if self.train is True: + if self.training is True: im, gt = x # call model.train() here ? 
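            # note: self.training is presumably the flag toggled by the module's
            # train()/eval() methods, so the caller is expected to set the mode
            # before calling forward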
else @@ -23,26 +21,9 @@ def forward(self, x): feats = self._features(im) - # improve - # it is used in get_anchors and also present in roi_pooling - self._feat_stride = round(im.size(4)/feats.size(4)) - # rpn - # put in a separate function - rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) - all_anchors = self.rpn_get_anchors(im) - #rpn_boxes = self.rpn_estimate(all_anchors, rpn_map) - if self.train is True: - rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) - # need to subsample boxes here - rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) - - # roi proposal - # clip, sort, pre nms topk, nms, after nms topk - # proposal_layer.py - # roi_boxes = self.get_roi_boxes(rpn_map, rpn_boxes) - roi_boxes = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform) - - if self.train is True: + roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) + + if self.training is True: # append gt boxes and sample fg / bg boxes # proposal_target-layer.py roi_boxes, frcnn_labels, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) @@ -54,7 +35,7 @@ def forward(self, x): boxes = self.bbox_reg(roi_boxes, bbox_transform) # apply cls + bbox reg loss here - if self.train is True: + if self.training is True: frcnn_loss = self.frcnn_loss(scores, boxes, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes @@ -63,141 +44,117 @@ def forward(self, x): # the user define their model in here def _features(self, x): - # _feat_stride should be defined / inferred from here pass def _classifier(self, x): pass def _roi_pooling(self, x): pass - def _rpn_classifier(self, x): - pass - # from faster rcnn py - def rpn_get_anchors(self, im): - height, width = im.size()[-2:] - # 1. Generate proposals from bbox deltas and shifted anchors - shift_x = np.arange(0, width) * self._feat_stride - shift_y = np.arange(0, height) * self._feat_stride - shift_x, shift_y = np.meshgrid(shift_x, shift_y) - shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), - shift_x.ravel(), shift_y.ravel())).transpose() - # add A anchors (1, A, 4) to - # cell K shifts (K, 1, 4) to get - # shift anchors (K, A, 4) - # reshape to (K*A, 4) shifted anchors - A = self._num_anchors - K = shifts.shape[0] - all_anchors = (self._anchors.reshape((1, A, 4)) + - shifts.reshape((1, K, 4)).transpose((1, 0, 2))) - all_anchors = all_anchors.reshape((K * A, 4)) - return all_anchors - - # restructure because we don't want -1 in labels - # shouldn't we instead keep only the bboxes for which labels >= 0? 
- def rpn_targets(self, all_anchors, im, gt): - total_anchors = all_anchors.shape[0] + def frcnn_targets(self, all_rois, im, gt): gt_boxes = gt['boxes'] - - height, width = im.size()[-2:] - # only keep anchors inside the image - _allowed_border = 0 - inds_inside = np.where( - (all_anchors[:, 0] >= -_allowed_border) & - (all_anchors[:, 1] >= -_allowed_border) & - (all_anchors[:, 2] < width + _allowed_border) & # width - (all_anchors[:, 3] < height + _allowed_border) # height - )[0] - - # keep only inside anchors - anchors = all_anchors[inds_inside, :] - - # label: 1 is positive, 0 is negative, -1 is dont care - labels = np.empty((len(inds_inside), ), dtype=np.float32) - labels.fill(-1) - - # overlaps between the anchors and the gt boxes - # overlaps (ex, gt) - overlaps = bbox_overlaps(anchors, gt_boxes)#.numpy() - argmax_overlaps = overlaps.argmax(axis=1) - max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] - gt_argmax_overlaps = overlaps.argmax(axis=0) - gt_max_overlaps = overlaps[gt_argmax_overlaps, - np.arange(overlaps.shape[1])] - gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + gt_labels = gt['gt_classes'] + #zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) + #all_rois = np.vstack( + # (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) + #) + all_rois = np.vstack(all_rois, gt_boxes) + zeros = np.zeros((all_rois.shape[0], 1), dtype=all_rois.dtype) + all_rois = np.hstack((zeros, all_rois)) - # assign bg labels first so that positive labels can clobber them - labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 - - # fg label: for each gt, anchor with highest overlap - labels[gt_argmax_overlaps] = 1 - - # fg label: above threshold IOU - labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + num_images = 1 + rois_per_image = self.batch_size / num_images + fg_rois_per_image = np.round(self.fg_fraction * rois_per_image) + + # Sample rois with classification labels and bounding box regression + # targets + labels, rois, bbox_targets = _sample_rois( + all_rois, gt_boxes, gt_labels, fg_rois_per_image, + rois_per_image, self._num_classes) + + return all_rois, labels, rois, bbox_targets - # subsample positive labels if we have too many - num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) - fg_inds = np.where(labels == 1)[0] - if len(fg_inds) > num_fg: - disable_inds = npr.choice( - fg_inds, size=(len(fg_inds) - num_fg), replace=False) - labels[disable_inds] = -1 - - # subsample negative labels if we have too many - num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) - bg_inds = np.where(labels == 0)[0] - if len(bg_inds) > num_bg: - disable_inds = npr.choice( - bg_inds, size=(len(bg_inds) - num_bg), replace=False) - labels[disable_inds] = -1 - - #bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) - #bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) - bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) - - # map up to original set of anchors - labels = _unmap(labels, total_anchors, inds_inside, fill=-1) - bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) - - return labels, bbox_targets - - - # I need to know the original image size (or have the scaling factor) - def get_roi_boxes(self, all_anchors, rpn_map, rpn_bbox_deltas) - - bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) - - # the first set of _num_anchors channels are bg probs - # the second set are the fg probs, which we want - #scores = bottom[0].data[:, self._num_anchors:, :, 
:] - scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) - - # Convert anchors into proposals via bbox transformations - proposals = bbox_transform_inv(anchors, bbox_deltas) - - # 2. clip predicted boxes to image - proposals = clip_boxes(proposals, im_info[:2]) - - # 3. remove predicted boxes with either height or width < threshold - # (NOTE: convert min_size to input image scale stored in im_info[2]) - keep = filter_boxes(proposals, min_size * im_info[2]) - proposals = proposals[keep, :] - scores = scores[keep] - - # 4. sort all (proposal, score) pairs by score from highest to lowest - # 5. take top pre_nms_topN (e.g. 6000) - order = scores.ravel().argsort()[::-1] - if pre_nms_topN > 0: - order = order[:pre_nms_topN] - proposals = proposals[order, :] - scores = scores[order] - - # 6. apply nms (e.g. threshold = 0.7) - # 7. take after_nms_topN (e.g. 300) - # 8. return the top proposals (-> RoIs top) - keep = nms(np.hstack((proposals, scores)), nms_thresh) - if post_nms_topN > 0: - keep = keep[:post_nms_topN] - proposals = proposals[keep, :] - scores = scores[keep] - - return roi_boxes +def _get_bbox_regression_labels(bbox_target_data, num_classes): + """Bounding-box regression targets (bbox_target_data) are stored in a + compact form N x (class, tx, ty, tw, th) + This function expands those targets into the 4-of-4*K representation used + by the network (i.e. only one class has non-zero targets). + Returns: + bbox_target (ndarray): N x 4K blob of regression targets + bbox_inside_weights (ndarray): N x 4K blob of loss weights + """ + + clss = bbox_target_data[:, 0] + bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) + bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) + inds = np.where(clss > 0)[0] + for ind in inds: + cls = clss[ind] + start = 4 * cls + end = start + 4 + bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] + return bbox_targets + + +def _compute_targets(ex_rois, gt_rois, labels): + """Compute bounding-box regression targets for an image.""" + + assert ex_rois.shape[0] == gt_rois.shape[0] + assert ex_rois.shape[1] == 4 + assert gt_rois.shape[1] == 4 + + targets = bbox_transform(ex_rois, gt_rois) + if False: #cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) + / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) + return np.hstack( + (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) + +def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, num_classes): + """Generate a random sample of RoIs comprising foreground and background + examples. 
+ """ + # overlaps: (rois x gt_boxes) + overlaps = bbox_overlaps( + np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), + np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) + gt_assignment = overlaps.argmax(axis=1) + max_overlaps = overlaps.max(axis=1) + #labels = gt_boxes[gt_assignment, 4] + labels = gt_labels[gt_assignment] + + # Select foreground RoIs as those with >= FG_THRESH overlap + fg_inds = np.where(max_overlaps >= self.fg_threshold)[0] + # Guard against the case when an image has fewer than fg_rois_per_image + # foreground RoIs + fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) + # Sample foreground regions without replacement + if fg_inds.size > 0: + fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = np.where((max_overlaps < self.bg_threshold[1]) & + (max_overlaps >= self.bg_threshold[0]))[0] + # Compute number of background RoIs to take from this image (guarding + # against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) + # Sample background regions without replacement + if bg_inds.size > 0: + bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) + + # The indices that we're selecting (both fg and bg) + keep_inds = np.append(fg_inds, bg_inds) + # Select sampled values from various arrays: + labels = labels[keep_inds] + # Clamp labels for the background RoIs to 0 + labels[fg_rois_per_this_image:] = 0 + rois = all_rois[keep_inds] + + bbox_target_data = _compute_targets( + rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) + + bbox_targets = \ + _get_bbox_regression_labels(bbox_target_data, num_classes) + + return labels, rois, bbox_targets diff --git a/fast_rcnn/generate_anchors.py b/fast_rcnn/generate_anchors.py new file mode 100644 index 0000000000..1125a801fe --- /dev/null +++ b/fast_rcnn/generate_anchors.py @@ -0,0 +1,105 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import numpy as np + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +#array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + +def generate_anchors(base_size=16, ratios=[0.5, 1, 2], + scales=2**np.arange(3, 6)): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. + """ + + base_anchor = np.array([1, 1, base_size, base_size]) - 1 + ratio_anchors = _ratio_enum(base_anchor, ratios) + anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) + for i in xrange(ratio_anchors.shape[0])]) + return anchors + +def _whctrs(anchor): + """ + Return width, height, x center, and y center for an anchor (window). 
+ """ + + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """ + Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack((x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1))) + return anchors + +def _ratio_enum(anchor, ratios): + """ + Enumerate a set of anchors for each aspect ratio wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +def _scale_enum(anchor, scales): + """ + Enumerate a set of anchors for each scale wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +if __name__ == '__main__': + import time + t = time.time() + a = generate_anchors() + print time.time() - t + print a + from IPython import embed; embed() diff --git a/fast_rcnn/py_cpu_nms.py b/fast_rcnn/py_cpu_nms.py new file mode 100644 index 0000000000..54e7b25fef --- /dev/null +++ b/fast_rcnn/py_cpu_nms.py @@ -0,0 +1,38 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py new file mode 100644 index 0000000000..6e1f61d15d --- /dev/null +++ b/fast_rcnn/rpn.py @@ -0,0 +1,309 @@ +#import torch +import torch.nn as nn +from torch.autograd import Variable +import numpy as np +import numpy.random as npr + +# clean up environment +from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps +from generate_anchors import generate_anchors + +from py_cpu_nms import py_cpu_nms as nms + +class RPN(nn.Container): + + def __init__(self): + super(RPN, self).__init__() + + anchor_scales = (8, 16, 32) + self._anchors = generate_anchors(scales=np.array(anchor_scales)) + self._num_anchors = self._anchors.shape[0] + + self.negative_overlap = 0.3 + self.positive_overlap = 0.7 + self.fg_fraction = 0.5 + self.batch_size = 256 + + # used for both train and test + self.nms_thresh = 0.7 + self.pre_nms_topN = 12000 + self.post_nms_topN = 2000 + self.min_size = 16 + + + # output rpn probs as well + def forward(self, im, feats, gt=None): + # improve + # it is used in get_anchors and also present in roi_pooling + 
self._feat_stride = round(im.size(3)/feats.size(3)) + # rpn + # put in a separate function + rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) + all_anchors = self.rpn_get_anchors(feats) + rpn_loss = None + if self.training is True: + assert gt is not None + rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) + # need to subsample boxes here + rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) + + # roi proposal + # clip, sort, pre nms topk, nms, after nms topk + # params are different for train and test + # proposal_layer.py + roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform, im) + # only for visualization + #roi_boxes = all_anchors + + #return roi_boxes, scores, rpn_loss + return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss + #return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss, \ + # Variable(torch.from_numpy(rpn_labels)) + + def _rpn_classifier(self, x): + x = Variable(x, requires_grad=True) + m1 = nn.Conv2d(3, 18, 3, 1, 1) + m2 = nn.Conv2d(3, 36, 3, 1, 1) + return m1(x), m2(x) + #pass + + # from faster rcnn py + def rpn_get_anchors(self, im): + height, width = im.size()[-2:] + # 1. Generate proposals from bbox deltas and shifted anchors + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + all_anchors = (self._anchors.reshape((1, A, 4)) + + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + return all_anchors + + # restructure because we don't want -1 in labels + # shouldn't we instead keep only the bboxes for which labels >= 0? 
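+    # note: the -1 ("don't care") labels are kept so that the label tensor stays
+    # aligned with the full H x W x A anchor grid; they are filtered out later in
+    # rpn_loss via labels.ge(0)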
+ def rpn_targets(self, all_anchors, im, gt): + total_anchors = all_anchors.shape[0] + gt_boxes = gt['boxes'] + + height, width = im.size()[-2:] + # only keep anchors inside the image + _allowed_border = 0 + inds_inside = np.where( + (all_anchors[:, 0] >= -_allowed_border) & + (all_anchors[:, 1] >= -_allowed_border) & + (all_anchors[:, 2] < width + _allowed_border) & # width + (all_anchors[:, 3] < height + _allowed_border) # height + )[0] + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + + # overlaps between the anchors and the gt boxes + # overlaps (ex, gt) + #overlaps = bbox_overlaps(anchors, gt_boxes)#.numpy() + overlaps = bbox_overlaps(torch.from_numpy(anchors), gt_boxes).numpy() + gt_boxes = gt_boxes.numpy() + argmax_overlaps = overlaps.argmax(axis=1) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < self.negative_overlap] = 0 + + # fg label: for each gt, anchor with highest overlap + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IOU + labels[max_overlaps >= self.positive_overlap] = 1 + + # subsample positive labels if we have too many + num_fg = int(self.fg_fraction * self.batch_size) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = self.batch_size - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice( + bg_inds, size=(len(bg_inds) - num_bg), replace=False) + labels[disable_inds] = -1 + + #bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) + #bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) + bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) + + # map up to original set of anchors + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + + return labels, bbox_targets + + # I need to know the original image size (or have the scaling factor) + def get_roi_boxes(self, anchors, rpn_map, rpn_bbox_deltas, im): + # TODO fix this!!! + im_info = (100, 100, 1) + + bbox_deltas = rpn_bbox_deltas.data.numpy() + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) + + # the first set of _num_anchors channels are bg probs + # the second set are the fg probs, which we want + #scores = bottom[0].data[:, self._num_anchors:, :, :] + scores = rpn_map.data[:, self._num_anchors:, :, :].numpy() + scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) + + # Convert anchors into proposals via bbox transformations + proposals = bbox_transform_inv(anchors, bbox_deltas) + + # 2. clip predicted boxes to image + proposals = clip_boxes(proposals, im.size()[-2:]) + + # 3. remove predicted boxes with either height or width < threshold + # (NOTE: convert min_size to input image scale stored in im_info[2]) + keep = filter_boxes(proposals, self.min_size * im_info[2]) + proposals = proposals[keep, :] + scores = scores[keep] + + # 4. 
sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 6000) + order = scores.ravel().argsort()[::-1] + if self.pre_nms_topN > 0: + order = order[:self.pre_nms_topN] + proposals = proposals[order, :] + scores = scores[order] + + # 6. apply nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. return the top proposals (-> RoIs top) + keep = nms(np.hstack((proposals, scores)), self.nms_thresh) + if self.post_nms_topN > 0: + keep = keep[:self.post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep] + + return proposals, scores + + def rpn_loss(self, rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets): + height, width = rpn_map.size()[-2:] + + rpn_map = rpn_map.view(-1, 2, height, width).permute(0,2,3,1).contiguous().view(-1, 2) + labels = torch.from_numpy(rpn_labels).long() # convert properly + labels = labels.view(1, height, width, -1).permute(0, 3, 1, 2).contiguous() + labels = labels.view(-1) + + idx = labels.ge(0).nonzero()[:,0] + rpn_map = rpn_map.index_select(0, Variable(idx, requires_grad=False)) + labels = labels.index_select(0, idx) + labels = Variable(labels, requires_grad=False) + + rpn_bbox_targets = torch.from_numpy(rpn_bbox_targets) + rpn_bbox_targets = rpn_bbox_targets.view(1, height, width, -1).permute(0, 3, 1, 2) + rpn_bbox_targets = Variable(rpn_bbox_targets, requires_grad=False) + + cls_crit = nn.CrossEntropyLoss() + reg_crit = nn.SmoothL1Loss() + cls_loss = cls_crit(rpn_map, labels) + # verify normalization and sigma + reg_loss = reg_crit(rpn_bbox_transform, rpn_bbox_targets) + + loss = cls_loss + reg_loss + return loss + +def _unmap(data, count, inds, fill=0): + """ Unmap a subset of item (data) back to the original set of items (of + size count) """ + if len(data.shape) == 1: + ret = np.empty((count, ), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + +def show(img, boxes, label): + from PIL import Image, ImageDraw + #img, target = self.__getitem__(index) + draw = ImageDraw.Draw(img) + for obj, t in zip(boxes, label): + #print(type(t)) + if t == 1: + #print(t) + draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) + #draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) + #else: + elif t == 0: + pass + #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + img.show() + + + +if __name__ == '__main__': + import torch + from voc import VOCDetection, TransformVOCDetectionAnnotation + rpn = RPN() + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + class_to_ind = dict(zip(cls, range(len(cls)))) + + + train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + + im, gt = train[11] + im0 = im + + if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() + im = im.float().div_(255).unsqueeze(0) + + + + feats = torch.rand(1,3,im.size(2)/16, im.size(3)/16) + print(feats.size()) + print(im.size()) + + #rpn.eval() + rpn.train() + import time + t = time.time() + #boxes, scores, loss, 
labels = rpn(im, feats, gt) + boxes, scores, loss = rpn(im, feats, gt) + print time.time() - t + print loss + loss.backward() + + #show(im0, boxes.data, labels.data.int().tolist()) + + #from IPython import embed; embed() From 3bee8e6fa82864201cba12068b1abe7cad9a0b8e Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 11:51:08 -0200 Subject: [PATCH 07/18] frcnn runs --- fast_rcnn/bbox_transform.py | 6 +++ fast_rcnn/faster_rcnn.py | 73 +++++++++++++++++++++++++++++-------- fast_rcnn/main2.py | 46 +++++++++++++++++++++++ fast_rcnn/rpn.py | 4 +- 4 files changed, 112 insertions(+), 17 deletions(-) create mode 100644 fast_rcnn/main2.py diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index 2775d1a3a4..c134cea059 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -86,6 +86,12 @@ def filter_boxes(boxes, min_size): # torch tensors def bbox_overlaps(a, bb): + if isinstance(a, np.ndarray): + a = torch.from_numpy(a) + + if isinstance(bb, np.ndarray): + bb = torch.from_numpy(bb) + oo = [] for b in bb: diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 33e9ad1918..30d21cc184 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -1,6 +1,15 @@ +import torch import torch.nn as nn +from torch.autograd import Variable import numpy as np +import numpy.random as npr +from rpn import RPN +from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps + +m1 = nn.Conv2d(3, 3, 3, 16, 1) +m2 = nn.Linear(3*3*3, 21) +m3 = nn.Linear(3*3*3, 21*4) # should handle multiple scales, how? class FasterRCNN(nn.Container): @@ -10,33 +19,35 @@ def __init__(self): self.fg_fraction = 0.25 self.fg_threshold = 0.5 self.bg_threshold = (0, 0.5) + self._num_classes = 21 + self.rpn = RPN() # should it support batched images ? def forward(self, x): if self.training is True: im, gt = x # call model.train() here ? 
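
Aside, not part of the patch: the overlap that `bbox_overlaps` computes one ground-truth box at a time can also be written as a fully vectorised NumPy routine. A sketch, assuming the same inclusive corner convention (`x2 - x1 + 1` widths) used throughout this code:

```python
import numpy as np

def iou_matrix(a, b):
    # a: (N, 4), b: (M, 4) boxes as (x1, y1, x2, y2); returns an (N, M) IoU matrix
    area_a = (a[:, 2] - a[:, 0] + 1) * (a[:, 3] - a[:, 1] + 1)
    area_b = (b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1)
    x1 = np.maximum(a[:, None, 0], b[None, :, 0])
    y1 = np.maximum(a[:, None, 1], b[None, :, 1])
    x2 = np.minimum(a[:, None, 2], b[None, :, 2])
    y2 = np.minimum(a[:, None, 3], b[None, :, 3])
    w = np.clip(x2 - x1 + 1, 0, None)   # clipped to 0 where boxes do not overlap
    h = np.clip(y2 - y1 + 1, 0, None)
    inter = w * h
    return inter / (area_a[:, None] + area_b[None, :] - inter)
```

For well-formed boxes this should agree with the torch loop above; non-overlapping pairs come out as 0 because the clipped width or height is 0.
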
- else + else: im = x - feats = self._features(im) + feats = self._features(_tovar(im)) roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) if self.training is True: # append gt boxes and sample fg / bg boxes # proposal_target-layer.py - roi_boxes, frcnn_labels, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) + all_rois, frcnn_labels, roi_boxes, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) # r-cnn regions = self._roi_pooling(feats, roi_boxes) scores, bbox_transform = self._classifier(regions) - boxes = self.bbox_reg(roi_boxes, bbox_transform) + boxes = self.bbox_reg(roi_boxes, bbox_transform, im) # apply cls + bbox reg loss here if self.training is True: - frcnn_loss = self.frcnn_loss(scores, boxes, frcnn_labels, frcnn_bbox_targets) + frcnn_loss = self.frcnn_loss(scores, bbox_transform, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes @@ -44,20 +55,33 @@ def forward(self, x): # the user define their model in here def _features(self, x): - pass + return m1(x) def _classifier(self, x): - pass - def _roi_pooling(self, x): - pass + return m2(x), m3(x) + def _roi_pooling(self, x, rois): + from roi_pooling import roi_pooling + x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) + + def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): + cls_crit = nn.CrossEntropyLoss() + cls_loss = cls_crit(scores, labels) + + reg_crit = nn.SmoothL1Loss() + reg_loss = reg_crit(bbox_transform, bbox_targets) + + loss = cls_loss + reg_loss + return loss def frcnn_targets(self, all_rois, im, gt): - gt_boxes = gt['boxes'] - gt_labels = gt['gt_classes'] + all_rois = all_rois.data.numpy() + gt_boxes = gt['boxes'].numpy() + gt_labels = np.array(gt['gt_classes']) #zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) #all_rois = np.vstack( # (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) #) - all_rois = np.vstack(all_rois, gt_boxes) + all_rois = np.vstack((all_rois, gt_boxes)) zeros = np.zeros((all_rois.shape[0], 1), dtype=all_rois.dtype) all_rois = np.hstack((zeros, all_rois)) @@ -67,11 +91,18 @@ def frcnn_targets(self, all_rois, im, gt): # Sample rois with classification labels and bounding box regression # targets - labels, rois, bbox_targets = _sample_rois( + labels, rois, bbox_targets = _sample_rois(self, all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, self._num_classes) - return all_rois, labels, rois, bbox_targets + return _tovar((all_rois, labels, rois, bbox_targets)) + + def bbox_reg(self, boxes, box_deltas, im): + boxes = boxes.data[:,1:].numpy() + box_deltas = box_deltas.data.numpy() + pred_boxes = bbox_transform_inv(boxes, box_deltas) + pred_boxes = clip_boxes(pred_boxes, im.size()[-2:]) + return _tovar(pred_boxes) def _get_bbox_regression_labels(bbox_target_data, num_classes): """Bounding-box regression targets (bbox_target_data) are stored in a @@ -110,7 +141,7 @@ def _compute_targets(ex_rois, gt_rois, labels): return np.hstack( (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) -def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, num_classes): +def _sample_rois(self, all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, num_classes): """Generate a random sample of RoIs comprising foreground and background examples. 
""" @@ -118,6 +149,7 @@ def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_imag overlaps = bbox_overlaps( np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) + overlaps = overlaps.numpy() gt_assignment = overlaps.argmax(axis=1) max_overlaps = overlaps.max(axis=1) #labels = gt_boxes[gt_assignment, 4] @@ -158,3 +190,14 @@ def _sample_rois(all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_imag _get_bbox_regression_labels(bbox_target_data, num_classes) return labels, rois, bbox_targets + +def _tovar(x): + if isinstance(x, np.ndarray): + return Variable(torch.from_numpy(x), requires_grad=False) + elif torch.is_tensor(x): + return Variable(x, requires_grad=True) + elif isinstance(x, tuple): + t = [] + for i in x: + t.append(_tovar(i)) + return t diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py new file mode 100644 index 0000000000..3ea273531a --- /dev/null +++ b/fast_rcnn/main2.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import torch.utils.trainer as trainer +import torch.utils.data +import numpy as np + +from roi_pooling import roi_pooling +from voc import VOCDetection, TransformVOCDetectionAnnotation +from faster_rcnn import FasterRCNN + +cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') +class_to_ind = dict(zip(cls, range(len(cls)))) + + +train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + + +#train_loader = torch.utils.data.DataLoader( +# ds, batch_size=1, shuffle=True, num_workers=0) + +frcnn = FasterRCNN() + +frcnn.train() +#for i, (im, gt) in (enumerate(train_loader)): + +im, gt = train[0] +if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() + im = im.float().div_(255) + im = im.unsqueeze(0) + + +loss, scores, boxes = frcnn((im, gt)) +from IPython import embed; embed() diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 6e1f61d15d..94dafc5cd0 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -1,4 +1,4 @@ -#import torch +import torch import torch.nn as nn from torch.autograd import Variable import numpy as np @@ -61,7 +61,7 @@ def forward(self, im, feats, gt=None): # Variable(torch.from_numpy(rpn_labels)) def _rpn_classifier(self, x): - x = Variable(x, requires_grad=True) + #x = Variable(x, requires_grad=True) m1 = nn.Conv2d(3, 18, 3, 1, 1) m2 = nn.Conv2d(3, 36, 3, 1, 1) return m1(x), m2(x) From 22e769688662001e8addb077cf64bbbb687c6f00 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 19:06:33 -0200 Subject: [PATCH 08/18] updating --- fast_rcnn/faster_rcnn.py | 6 +++--- fast_rcnn/main2.py | 35 ++++++++++++++++---------------- fast_rcnn/roi_pooling.py | 44 ++++++++++++++++++++++++++++++++++++++-- fast_rcnn/rpn.py | 1 + 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 30d21cc184..440cd5bd75 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -8,8 +8,8 @@ from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, 
filter_boxes, bbox_overlaps m1 = nn.Conv2d(3, 3, 3, 16, 1) -m2 = nn.Linear(3*3*3, 21) -m3 = nn.Linear(3*3*3, 21*4) +m2 = nn.Linear(3*7*7, 21) +m3 = nn.Linear(3*7*7, 21*4) # should handle multiple scales, how? class FasterRCNN(nn.Container): @@ -60,7 +60,7 @@ def _classifier(self, x): return m2(x), m3(x) def _roi_pooling(self, x, rois): from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) + x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) return x.view(x.size(0), -1) def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py index 3ea273531a..a448e7dee6 100644 --- a/fast_rcnn/main2.py +++ b/fast_rcnn/main2.py @@ -4,10 +4,12 @@ import torch.utils.trainer as trainer import torch.utils.data import numpy as np +import torchvision.transforms as transforms from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation from faster_rcnn import FasterRCNN +from tqdm import tqdm cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', @@ -19,28 +21,25 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) +def collate_fn(batch): + imgs, gt = zip(*batch) + return imgs[0].unsqueeze(0), gt[0] -#train_loader = torch.utils.data.DataLoader( -# ds, batch_size=1, shuffle=True, num_workers=0) +train_loader = torch.utils.data.DataLoader( + train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) frcnn = FasterRCNN() frcnn.train() -#for i, (im, gt) in (enumerate(train_loader)): - -im, gt = train[0] -if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255) - im = im.unsqueeze(0) - - -loss, scores, boxes = frcnn((im, gt)) -from IPython import embed; embed() +for i, (im, gt) in tqdm(enumerate(train_loader)): + loss, scores, boxes = frcnn((im, gt)) + loss.backward() + +#im, gt = train[0] +#im = im.unsqueeze(0) + +#loss, scores, boxes = frcnn((im, gt)) +#from IPython import embed; embed() diff --git a/fast_rcnn/roi_pooling.py b/fast_rcnn/roi_pooling.py index f56cbd6fd4..5b808a653f 100644 --- a/fast_rcnn/roi_pooling.py +++ b/fast_rcnn/roi_pooling.py @@ -3,9 +3,42 @@ import torch.autograd as ag import math +#import torch.nn.functions as F +from torch.autograd.function import Function +from torch._thnn import type2backend + + +class AdaptiveMaxPool2d(Function): + def __init__(self, out_w, out_h): + super(AdaptiveMaxPool2d, self).__init__() + self.out_w = out_w + self.out_h = out_h + + def forward(self, input): + output = input.new() + indices = input.new().long() + self.save_for_backward(input) + self.indices = indices + self._backend = type2backend[type(input)] + self._backend.SpatialAdaptiveMaxPooling_updateOutput( + self._backend.library_state, input, output, indices, + self.out_w, self.out_h) + return output + + def backward(self, grad_output): + input, = self.saved_tensors + indices = self.indices + grad_input = grad_output.new() + self._backend.SpatialAdaptiveMaxPooling_updateGradInput( + self._backend.library_state, input, grad_output, grad_input, + indices) + return grad_input, None + + + # approximation for the adaptive max pooling which is currently missing 
from nn # doesn't work if the input is smaller than size -def adaptive_max_pool(input, size): +def adaptive_max_pool_old(input, size): s = input.size()[2:] assert(s[0]>= size[0] and s[1] >= size[1]) ratio = [float(x)/y for x,y in zip(s, size)] @@ -16,6 +49,10 @@ def adaptive_max_pool(input, size): return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=True)(input) #return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=False)(input) +def adaptive_max_pool(input, size): + #return F.thnn.AdaptiveMaxPool2d(size[0],size[1])(input) + return AdaptiveMaxPool2d(size[0],size[1])(input) + def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): assert(rois.dim() == 2) assert(rois.size(1) == 5) @@ -28,7 +65,7 @@ def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): for i in range(num_rois): roi = rois[i] im_idx = roi[0] - im = input.narrow(0, im_idx, 1)[..., roi[2]:roi[4], roi[1]:roi[3]] + im = input.narrow(0, im_idx, 1)[..., roi[2]:(roi[4]+1), roi[1]:(roi[3]+1)] output.append(adaptive_max_pool(im, size)) return torch.cat(output, 0) @@ -38,6 +75,9 @@ def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): rois = ag.Variable(torch.LongTensor([[0,1,2,7,8],[0,3,3,8,8]]),requires_grad=False) #rois = ag.Variable(torch.LongTensor([[0,3,3,8,8]]),requires_grad=False) + out = adaptive_max_pool(input,(3,3)) + out.backward(out.data.clone().uniform_()) + out = roi_pooling(input, rois, size=(3,3)) out.backward(out.data.clone().uniform_()) diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 94dafc5cd0..d0e2cdfb26 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -105,6 +105,7 @@ def rpn_targets(self, all_anchors, im, gt): # keep only inside anchors anchors = all_anchors[inds_inside, :] + assert anchors.shape[0] > 0 # label: 1 is positive, 0 is negative, -1 is dont care labels = np.empty((len(inds_inside), ), dtype=np.float32) From 5e71e6cc304e79fbfab9875163bf7b67df8de3e0 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 19:37:35 -0200 Subject: [PATCH 09/18] A bit of organization --- fast_rcnn/bbox_transform.py | 1 - fast_rcnn/faster_rcnn.py | 29 ++++++++------------- fast_rcnn/main2.py | 51 ++++++++++++++++++++++++++++++++++--- fast_rcnn/roi_pooling.py | 18 ------------- fast_rcnn/rpn.py | 41 +++++++++++++++-------------- 5 files changed, 77 insertions(+), 63 deletions(-) diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index c134cea059..e4c60ac223 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -88,7 +88,6 @@ def filter_boxes(boxes, min_size): def bbox_overlaps(a, bb): if isinstance(a, np.ndarray): a = torch.from_numpy(a) - if isinstance(bb, np.ndarray): bb = torch.from_numpy(bb) diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 440cd5bd75..019ca8abe0 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -3,24 +3,25 @@ from torch.autograd import Variable import numpy as np import numpy.random as npr -from rpn import RPN from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps -m1 = nn.Conv2d(3, 3, 3, 16, 1) -m2 = nn.Linear(3*7*7, 21) -m3 = nn.Linear(3*7*7, 21*4) # should handle multiple scales, how? 
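
Aside, not part of the patch: the same RoI pooling idea expressed with the functional adaptive max pool that later torch versions expose directly. A minimal sketch; the `(batch_idx, x1, y1, x2, y2)` RoI layout and the rounding of scaled coordinates are assumptions on my side:

```python
import torch
import torch.nn.functional as F

def roi_pool_sketch(feats, rois, size=(7, 7), spatial_scale=1.0 / 16.0):
    # feats: (B, C, H, W) feature map
    # rois:  (R, 5) rows of (batch_idx, x1, y1, x2, y2) in input-image coordinates
    outputs = []
    for roi in rois:
        idx = int(roi[0])
        x1, y1, x2, y2 = [int(round(float(v) * spatial_scale)) for v in roi[1:]]
        region = feats[idx:idx + 1, :, y1:y2 + 1, x1:x2 + 1]
        outputs.append(F.adaptive_max_pool2d(region, size))
    return torch.cat(outputs, 0)
```

The `(R, C, 7, 7)` result is then flattened with `.view(x.size(0), -1)` before the linear heads, as `_roi_pooling` does.
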
class FasterRCNN(nn.Container): - def __init__(self): + def __init__(self, features, pooler, classifier, rpn): super(FasterRCNN, self).__init__() + self.features = features + self.roi_pooling = pooler + self.classifier = classifier + self.rpn = rpn + self.batch_size = 128 self.fg_fraction = 0.25 self.fg_threshold = 0.5 self.bg_threshold = (0, 0.5) self._num_classes = 21 - self.rpn = RPN() + # should it support batched images ? def forward(self, x): @@ -30,7 +31,7 @@ def forward(self, x): else: im = x - feats = self._features(_tovar(im)) + feats = self.features(_tovar(im)) roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) @@ -40,8 +41,8 @@ def forward(self, x): all_rois, frcnn_labels, roi_boxes, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) # r-cnn - regions = self._roi_pooling(feats, roi_boxes) - scores, bbox_transform = self._classifier(regions) + regions = self.roi_pooling(feats, roi_boxes) + scores, bbox_transform = self.classifier(regions) boxes = self.bbox_reg(roi_boxes, bbox_transform, im) @@ -53,16 +54,6 @@ def forward(self, x): return scores, boxes - # the user define their model in here - def _features(self, x): - return m1(x) - def _classifier(self, x): - return m2(x), m3(x) - def _roi_pooling(self, x, rois): - from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) - return x.view(x.size(0), -1) - def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): cls_crit = nn.CrossEntropyLoss() cls_loss = cls_crit(scores, labels) diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py index a448e7dee6..5c791807f2 100644 --- a/fast_rcnn/main2.py +++ b/fast_rcnn/main2.py @@ -5,6 +5,8 @@ import torch.utils.data import numpy as np import torchvision.transforms as transforms +from rpn import RPN +import torch.optim as optim from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation @@ -31,12 +33,53 @@ def collate_fn(batch): train_loader = torch.utils.data.DataLoader( train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) -frcnn = FasterRCNN() +class Features(nn.Container): + def __init__(self): + super(Features, self).__init__() + self.m = nn.Conv2d(3, 3, 3, 16, 1) + + def forward(self, x): + return self.m(x) + +class Classifier(nn.Container): + def __init__(self): + super(Classifier, self).__init__() + self.m1 = nn.Linear(3*7*7, 21) + self.m2 = nn.Linear(3*7*7, 21*4) + + def forward(self, x): + return self.m1(x), self.m2(x) + +def pooler(x, rois): + from roi_pooling import roi_pooling + x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) + +class RPNClassifier(nn.Container): + def __init__(self, n): + super(RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + +rpn = RPN(RPNClassifier(3)) + +frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) frcnn.train() -for i, (im, gt) in tqdm(enumerate(train_loader)): - loss, scores, boxes = frcnn((im, gt)) - loss.backward() + +optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) + + +from IPython import embed; embed() + +#for i, (im, gt) in tqdm(enumerate(train_loader)): +# optimizer.zero_grad() +# loss, scores, boxes = frcnn((im, gt)) +# loss.backward() +# optimizer.step() #im, gt = train[0] #im = im.unsqueeze(0) diff --git a/fast_rcnn/roi_pooling.py b/fast_rcnn/roi_pooling.py index 5b808a653f..885e3a734a 100644 --- a/fast_rcnn/roi_pooling.py +++ 
b/fast_rcnn/roi_pooling.py @@ -3,11 +3,9 @@ import torch.autograd as ag import math -#import torch.nn.functions as F from torch.autograd.function import Function from torch._thnn import type2backend - class AdaptiveMaxPool2d(Function): def __init__(self, out_w, out_h): super(AdaptiveMaxPool2d, self).__init__() @@ -34,23 +32,7 @@ def backward(self, grad_output): indices) return grad_input, None - - -# approximation for the adaptive max pooling which is currently missing from nn -# doesn't work if the input is smaller than size -def adaptive_max_pool_old(input, size): - s = input.size()[2:] - assert(s[0]>= size[0] and s[1] >= size[1]) - ratio = [float(x)/y for x,y in zip(s, size)] - kernel_size = [int(math.ceil(x)) for x in ratio] - stride = kernel_size - remainder = [x*y-z for x, y, z in zip(kernel_size, size, s)] - padding = [int(math.floor((x+1)/2)) for x in remainder] - return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=True)(input) - #return nn.MaxPool2d(kernel_size,stride,padding=padding, ceil_mode=False)(input) - def adaptive_max_pool(input, size): - #return F.thnn.AdaptiveMaxPool2d(size[0],size[1])(input) return AdaptiveMaxPool2d(size[0],size[1])(input) def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index d0e2cdfb26..788c02e73a 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -12,9 +12,11 @@ class RPN(nn.Container): - def __init__(self): + def __init__(self, classifier): super(RPN, self).__init__() + self.rpn_classifier = classifier + anchor_scales = (8, 16, 32) self._anchors = generate_anchors(scales=np.array(anchor_scales)) self._num_anchors = self._anchors.shape[0] @@ -38,7 +40,7 @@ def forward(self, im, feats, gt=None): self._feat_stride = round(im.size(3)/feats.size(3)) # rpn # put in a separate function - rpn_map, rpn_bbox_transform = self._rpn_classifier(feats) + rpn_map, rpn_bbox_transform = self.rpn_classifier(feats) all_anchors = self.rpn_get_anchors(feats) rpn_loss = None if self.training is True: @@ -60,13 +62,6 @@ def forward(self, im, feats, gt=None): #return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss, \ # Variable(torch.from_numpy(rpn_labels)) - def _rpn_classifier(self, x): - #x = Variable(x, requires_grad=True) - m1 = nn.Conv2d(3, 18, 3, 1, 1) - m2 = nn.Conv2d(3, 36, 3, 1, 1) - return m1(x), m2(x) - #pass - # from faster rcnn py def rpn_get_anchors(self, im): height, width = im.size()[-2:] @@ -264,7 +259,19 @@ def show(img, boxes, label): if __name__ == '__main__': import torch from voc import VOCDetection, TransformVOCDetectionAnnotation - rpn = RPN() + import torchvision.transforms as transforms + + class RPNClassifier(nn.Container): + def __init__(self, n): + super(RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + + + rpn = RPN(RPNClassifier(3)) cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', @@ -275,23 +282,15 @@ def show(img, boxes, label): train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) im, gt = train[11] im0 = im - if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, 
this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255).unsqueeze(0) - - + im = im.unsqueeze(0) - feats = torch.rand(1,3,im.size(2)/16, im.size(3)/16) + feats = Variable(torch.rand(1,3,im.size(2)/16, im.size(3)/16)) print(feats.size()) print(im.size()) From 9e65a2f15aeeb392d2385c0d88b927ad73d728df Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 20:18:58 -0200 Subject: [PATCH 10/18] Organization --- fast_rcnn/bbox_transform.py | 14 ++++++++++- fast_rcnn/faster_rcnn.py | 46 ++++++++++++++++++------------------- fast_rcnn/main2.py | 12 +++++----- fast_rcnn/rpn.py | 40 ++++++++++++++++++-------------- 4 files changed, 65 insertions(+), 47 deletions(-) diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/bbox_transform.py index e4c60ac223..a443cd1153 100644 --- a/fast_rcnn/bbox_transform.py +++ b/fast_rcnn/bbox_transform.py @@ -6,6 +6,7 @@ # -------------------------------------------------------- import torch +from torch.autograd import Variable import numpy as np def bbox_transform(ex_rois, gt_rois): @@ -120,4 +121,15 @@ def bbox_overlaps(a, bb): return torch.cat([o.view(-1,1) for o in oo],1) - +def to_var(x): + if isinstance(x, np.ndarray): + return Variable(torch.from_numpy(x), requires_grad=False) + elif torch.is_tensor(x): + return Variable(x, requires_grad=True) + elif isinstance(x, tuple): + t = [] + for i in x: + t.append(to_var(i)) + return t + elif isinstance(x, Variable): + return x diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 019ca8abe0..f0e07455a2 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -4,30 +4,39 @@ import numpy as np import numpy.random as npr -from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps +from bbox_transform import \ + bbox_transform, bbox_transform_inv, clip_boxes, bbox_overlaps + +from bbox_transform import to_var as _tovar # should handle multiple scales, how? class FasterRCNN(nn.Container): - def __init__(self, features, pooler, classifier, rpn): + def __init__(self, + features, pooler, + classifier, rpn, + batch_size=128, fg_fraction=0.25, + fg_threshold=0.5, bg_threshold=None, + num_classes=21): super(FasterRCNN, self).__init__() self.features = features self.roi_pooling = pooler self.classifier = classifier self.rpn = rpn - self.batch_size = 128 - self.fg_fraction = 0.25 - self.fg_threshold = 0.5 - self.bg_threshold = (0, 0.5) - self._num_classes = 21 + self.batch_size = batch_size + self.fg_fraction = fg_fraction + self.fg_threshold = fg_threshold + if bg_threshold is None: + bg_threshold = (0, 0.5) + self.bg_threshold = bg_threshold + self._num_classes = num_classes # should it support batched images ? def forward(self, x): if self.training is True: im, gt = x - # call model.train() here ? 
else: im = x @@ -42,24 +51,24 @@ def forward(self, x): # r-cnn regions = self.roi_pooling(feats, roi_boxes) - scores, bbox_transform = self.classifier(regions) + scores, bbox_pred = self.classifier(regions) - boxes = self.bbox_reg(roi_boxes, bbox_transform, im) + boxes = self.bbox_reg(roi_boxes, bbox_pred, im) # apply cls + bbox reg loss here if self.training is True: - frcnn_loss = self.frcnn_loss(scores, bbox_transform, frcnn_labels, frcnn_bbox_targets) + frcnn_loss = self.frcnn_loss(scores, bbox_pred, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes return scores, boxes - def frcnn_loss(self, scores, bbox_transform, labels, bbox_targets): + def frcnn_loss(self, scores, bbox_pred, labels, bbox_targets): cls_crit = nn.CrossEntropyLoss() cls_loss = cls_crit(scores, labels) reg_crit = nn.SmoothL1Loss() - reg_loss = reg_crit(bbox_transform, bbox_targets) + reg_loss = reg_crit(bbox_pred, bbox_targets) loss = cls_loss + reg_loss return loss @@ -182,13 +191,4 @@ def _sample_rois(self, all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_pe return labels, rois, bbox_targets -def _tovar(x): - if isinstance(x, np.ndarray): - return Variable(torch.from_numpy(x), requires_grad=False) - elif torch.is_tensor(x): - return Variable(x, requires_grad=True) - elif isinstance(x, tuple): - t = [] - for i in x: - t.append(_tovar(i)) - return t + diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py index 5c791807f2..604c07e263 100644 --- a/fast_rcnn/main2.py +++ b/fast_rcnn/main2.py @@ -73,13 +73,13 @@ def forward(self, x): optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) -from IPython import embed; embed() +#from IPython import embed; embed() -#for i, (im, gt) in tqdm(enumerate(train_loader)): -# optimizer.zero_grad() -# loss, scores, boxes = frcnn((im, gt)) -# loss.backward() -# optimizer.step() +for i, (im, gt) in tqdm(enumerate(train_loader)): + optimizer.zero_grad() + loss, scores, boxes = frcnn((im, gt)) + loss.backward() + optimizer.step() #im, gt = train[0] #im = im.unsqueeze(0) diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 788c02e73a..bbbe2f6096 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -8,29 +8,38 @@ from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps from generate_anchors import generate_anchors +from bbox_transform import to_var as _tovar + from py_cpu_nms import py_cpu_nms as nms class RPN(nn.Container): - def __init__(self, classifier): + def __init__(self, + classifier, anchor_scales=None, + negative_overlap=0.3, positive_overlap=0.7, + fg_fraction=0.5, batch_size=256, + nms_thresh=0.7, min_size=16, + pre_nms_topN=12000, post_nms_topN=2000 + ): super(RPN, self).__init__() self.rpn_classifier = classifier - anchor_scales = (8, 16, 32) + if anchor_scales is None: + anchor_scales = (8, 16, 32) self._anchors = generate_anchors(scales=np.array(anchor_scales)) self._num_anchors = self._anchors.shape[0] - self.negative_overlap = 0.3 - self.positive_overlap = 0.7 - self.fg_fraction = 0.5 - self.batch_size = 256 + self.negative_overlap = negative_overlap + self.positive_overlap = positive_overlap + self.fg_fraction = fg_fraction + self.batch_size = batch_size # used for both train and test - self.nms_thresh = 0.7 - self.pre_nms_topN = 12000 - self.post_nms_topN = 2000 - self.min_size = 16 + self.nms_thresh = nms_thresh + self.pre_nms_topN = pre_nms_topN + self.post_nms_topN = post_nms_topN + self.min_size = min_size # output rpn probs as well @@ -40,27 +49,24 @@ def 
forward(self, im, feats, gt=None): self._feat_stride = round(im.size(3)/feats.size(3)) # rpn # put in a separate function - rpn_map, rpn_bbox_transform = self.rpn_classifier(feats) + rpn_map, rpn_bbox_pred = self.rpn_classifier(feats) all_anchors = self.rpn_get_anchors(feats) rpn_loss = None if self.training is True: assert gt is not None rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) # need to subsample boxes here - rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets) + rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_pred, rpn_labels, rpn_bbox_targets) # roi proposal # clip, sort, pre nms topk, nms, after nms topk # params are different for train and test # proposal_layer.py - roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_transform, im) + roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_pred, im) # only for visualization #roi_boxes = all_anchors - #return roi_boxes, scores, rpn_loss - return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss - #return Variable(torch.from_numpy(roi_boxes),requires_grad=False), Variable(torch.from_numpy(scores),requires_grad=False), rpn_loss, \ - # Variable(torch.from_numpy(rpn_labels)) + return _tovar((roi_boxes, scores, rpn_loss)) # from faster rcnn py def rpn_get_anchors(self, im): From e1196727132a5086f17ed868efa3ade2853b6e12 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 20:19:36 -0200 Subject: [PATCH 11/18] Rename --- fast_rcnn/main.py | 285 +++++++----------------------------------- fast_rcnn/main2.py | 88 ------------- fast_rcnn/main_old.py | 277 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 325 insertions(+), 325 deletions(-) delete mode 100644 fast_rcnn/main2.py create mode 100644 fast_rcnn/main_old.py diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 0cb7795536..604c07e263 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -4,13 +4,15 @@ import torch.utils.trainer as trainer import torch.utils.data import numpy as np +import torchvision.transforms as transforms +from rpn import RPN +import torch.optim as optim from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation - +from faster_rcnn import FasterRCNN from tqdm import tqdm - cls = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', @@ -21,257 +23,66 @@ train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) +def collate_fn(batch): + imgs, gt = zip(*batch) + return imgs[0].unsqueeze(0), gt[0] -# TODO -# add class information in dataset -# separate in different files -# remove hard-coding 21 from Sampler -# cache the sampled boxes ? 
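
Aside, not part of the patch: how the `nms_thresh` / `pre_nms_topN` / `post_nms_topN` knobs added to the RPN constructor above are consumed when selecting proposals. A standalone NumPy sketch, with `nms_fn` standing in for `py_cpu_nms`; the min-size filtering and box clipping that precede this step in `get_roi_boxes` are left out:

```python
import numpy as np

def select_proposals(proposals, scores, nms_fn,
                     nms_thresh=0.7, pre_nms_topN=12000, post_nms_topN=2000):
    # proposals: (N, 4); scores: (N, 1); nms_fn: (dets (N, 5), thresh) -> kept indices
    order = scores.ravel().argsort()[::-1][:pre_nms_topN]
    proposals, scores = proposals[order, :], scores[order]
    keep = nms_fn(np.hstack((proposals, scores)), nms_thresh)[:post_nms_topN]
    return proposals[keep, :], scores[keep]
```
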
- -# image flip goes to the dataset class, not BoxSampler - -def bbox_overlaps(a, bb): - #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b - oo = [] - - for b in bb: - - x1 = a.select(1,0).clone() - x1[x1.lt(b[0])] = b[0] - y1 = a.select(1,1).clone() - y1[y1.lt(b[1])] = b[1] - x2 = a.select(1,2).clone() - x2[x2.gt(b[2])] = b[2] - y2 = a.select(1,3).clone() - y2[y2.gt(b[3])] = b[3] - - w = x2-x1+1 - h = y2-y1+1 - inter = torch.mul(w,h).float() - aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() - barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) - - # intersection over union overlap - o = torch.div(inter , (aarea+barea-inter)) - # set invalid entries to 0 overlap - o[w.lt(0)] = 0 - o[h.lt(0)] = 0 - - oo += [o] - - return torch.cat([o.view(-1,1) for o in oo],1) - -class BoxGenerator(object): - def __init__(self, num_boxes=2000): - super(BoxGenerator, self).__init__() - self.num_boxes = num_boxes - - def __call__(self, im): - #h, w = im.size()[1:] - w, h = im.size - x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) - y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) - - x = x[0] - y = y[0] - - return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) - - -class BoxSampler(torch.utils.data.Dataset): - - def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), - generate_boxes=BoxGenerator(num_boxes=10000)): - super(BoxSampler, self).__init__() - self.dataset = dataset - self.fg_threshold = fg_threshold - self.bg_threshold = bg_threshold - self.generate_boxes = generate_boxes - - def _overlap_and_attribute(self, boxes, gt_roidb): - - #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) - overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) - - if gt_roidb is not None and gt_roidb['boxes'].size > 0: - gt_boxes = gt_roidb['boxes'] - gt_classes = np.array(gt_roidb['gt_classes']) - gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() - argmaxes = gt_overlaps.argmax(axis=1) - maxes = gt_overlaps.max(axis=1) - - # remove low scoring - pos = maxes >= self.fg_threshold - neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) - maxes[neg] = 0 - - I = np.where(maxes > 0)[0] - overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] - - overlaps = overlaps[pos | neg] - boxes = boxes.numpy() - boxes = boxes[pos | neg] - return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) - - def __getitem__(self, idx): - im, gt = self.dataset[idx] - boxes = self.generate_boxes(im) - boxes, labels = self._overlap_and_attribute(boxes, gt) - - if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255) - - return im, boxes, labels - - def __len__(self): - return len(self.dataset) - - -class BoxSelector(torch.utils.data.Dataset): - def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): - super(BoxSelector, self).__init__() - self.dataset = dataset - self.num_boxes = num_boxes - self.fg_fraction = fg_fraction - - def __len__(self): - return len(self.dataset) - - def __getitem__(self, idx): - im, boxes, labels = self.dataset[idx] - - boxes = boxes.numpy() - labels = labels.numpy() - - bg = np.where(labels == 0)[0] - fg = np.where(labels != 0)[0] - nfg = min(len(fg), self.num_boxes*self.fg_fraction) - nbg = min(len(bg), self.num_boxes - nfg) 
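
Aside, not part of the patch: the foreground/background subsampling rule that both the old `BoxSelector` being deleted here and `_sample_rois` rely on: cap the positives at `fg_fraction * num_boxes` and fill the remainder with negatives. A NumPy sketch (which labels count as positive is decided upstream by the overlap thresholds):

```python
import numpy as np

def subsample_fg_bg(labels, num_boxes=128, fg_fraction=0.25):
    # labels: (N,) int array with 0 = background, > 0 = object class
    fg = np.where(labels != 0)[0]
    bg = np.where(labels == 0)[0]
    n_fg = int(min(len(fg), round(num_boxes * fg_fraction)))
    n_bg = int(min(len(bg), num_boxes - n_fg))
    fg = np.random.permutation(fg)[:n_fg]
    bg = np.random.permutation(bg)[:n_bg]
    return np.concatenate([fg, bg])   # indices of the kept boxes
```
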
+train_loader = torch.utils.data.DataLoader( + train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) - bg = bg[np.random.permutation(len(bg))[:nbg]] - fg = fg[np.random.permutation(len(fg))[:nfg]] +class Features(nn.Container): + def __init__(self): + super(Features, self).__init__() + self.m = nn.Conv2d(3, 3, 3, 16, 1) - I = np.concatenate([fg, bg], axis=0) + def forward(self, x): + return self.m(x) - return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) +class Classifier(nn.Container): + def __init__(self): + super(Classifier, self).__init__() + self.m1 = nn.Linear(3*7*7, 21) + self.m2 = nn.Linear(3*7*7, 21*4) + def forward(self, x): + return self.m1(x), self.m2(x) -class ToPILImage(object): - """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W - or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C - to a PIL.Image of range [0, 255] - """ - def __call__(self, pic): - from PIL import Image, ImageOps - if isinstance(pic, np.ndarray): - # handle numpy array - img = Image.fromarray(pic) - else: - npimg = pic.mul(255).byte().numpy() - npimg = np.transpose(npimg, (1,2,0)) - img = Image.fromarray(npimg) - return img +def pooler(x, rois): + from roi_pooling import roi_pooling + x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) -def make_grid(tensor, nrow=8, padding=2): - import math - """ - Given a 4D mini-batch Tensor of shape (B x C x H x W), - or a list of images all of the same size, - makes a grid of images - """ - tensorlist = None - if isinstance(tensor, list): - tensorlist = tensor - numImages = len(tensorlist) - size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) - tensor = tensorlist[0].new(size) - for i in range(numImages): - tensor[i].copy_(tensorlist[i]) - if tensor.dim() == 2: # single image H x W - tensor = tensor.view(1, tensor.size(0), tensor.size(1)) - if tensor.dim() == 3: # single image - if tensor.size(0) == 1: - tensor = torch.cat((tensor, tensor, tensor), 0) - return tensor - if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images - tensor = torch.cat((tensor, tensor, tensor), 1) - # make the mini-batch of images into a grid - nmaps = tensor.size(0) - xmaps = min(nrow, nmaps) - ymaps = int(math.ceil(nmaps / xmaps)) - height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) - grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) - k = 0 - for y in range(ymaps): - for x in range(xmaps): - if k >= nmaps: - break - grid.narrow(1, y*height+1+padding//2,height-padding)\ - .narrow(2, x*width+1+padding//2, width-padding)\ - .copy_(tensor[k]) - k = k + 1 - return grid +class RPNClassifier(nn.Container): + def __init__(self, n): + super(RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + def forward(self, x): + return self.m1(x), self.m2(x) +rpn = RPN(RPNClassifier(3)) -ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) +frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) -def collate_fn(batch): - imgs, boxes, labels = zip(*batch) - max_size = [max(size) for size in zip(*[im.size() for im in imgs])] - new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) - for im, im2 in zip(new_imgs, imgs): - im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) - boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) - boxes = torch.from_numpy(boxes) - labels = 
torch.cat(labels, 0) - return new_imgs, boxes, labels - -train_loader = torch.utils.data.DataLoader( - ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) +frcnn.train() +optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) -def show(img, boxes, label, cls=None): - from PIL import Image, ImageDraw - #img, target = self.__getitem__(index) - if cls is None: - cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') - draw = ImageDraw.Draw(img) - for obj, t in zip(boxes, label): - if t > 0: - draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) - draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) - else: - #pass - draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) - img.show() +#from IPython import embed; embed() +for i, (im, gt) in tqdm(enumerate(train_loader)): + optimizer.zero_grad() + loss, scores, boxes = frcnn((im, gt)) + loss.backward() + optimizer.step() -for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): - #grid = make_grid(img, 2, 1) - #grid = ToPILImage()(grid) - #grid.show() - #break - pass - #print('====') - #print(i) - #print(img.size()) - #print(boxes.size()) - #print(labels.size()) +#im, gt = train[0] +#im = im.unsqueeze(0) -#im, box, label = ds[10] -#show(im,box,label) +#loss, scores, boxes = frcnn((im, gt)) +#from IPython import embed; embed() diff --git a/fast_rcnn/main2.py b/fast_rcnn/main2.py deleted file mode 100644 index 604c07e263..0000000000 --- a/fast_rcnn/main2.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch -import torch.nn as nn -import torch.autograd as ag -import torch.utils.trainer as trainer -import torch.utils.data -import numpy as np -import torchvision.transforms as transforms -from rpn import RPN -import torch.optim as optim - -from roi_pooling import roi_pooling -from voc import VOCDetection, TransformVOCDetectionAnnotation -from faster_rcnn import FasterRCNN -from tqdm import tqdm - -cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') -class_to_ind = dict(zip(cls, range(len(cls)))) - - -train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', - transform=transforms.ToTensor(), - target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) - -def collate_fn(batch): - imgs, gt = zip(*batch) - return imgs[0].unsqueeze(0), gt[0] - -train_loader = torch.utils.data.DataLoader( - train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) - -class Features(nn.Container): - def __init__(self): - super(Features, self).__init__() - self.m = nn.Conv2d(3, 3, 3, 16, 1) - - def forward(self, x): - return self.m(x) - -class Classifier(nn.Container): - def __init__(self): - super(Classifier, self).__init__() - self.m1 = nn.Linear(3*7*7, 21) - self.m2 = nn.Linear(3*7*7, 21*4) - - def forward(self, x): - return self.m1(x), self.m2(x) - -def pooler(x, rois): - from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) - return x.view(x.size(0), -1) - -class RPNClassifier(nn.Container): - def __init__(self, n): - super(RPNClassifier, self).__init__() - self.m1 = nn.Conv2d(n, 18, 3, 1, 1) - self.m2 = nn.Conv2d(n, 36, 3, 1, 1) - - def forward(self, x): - return self.m1(x), 
self.m2(x) - -rpn = RPN(RPNClassifier(3)) - -frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) - -frcnn.train() - -optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) - - -#from IPython import embed; embed() - -for i, (im, gt) in tqdm(enumerate(train_loader)): - optimizer.zero_grad() - loss, scores, boxes = frcnn((im, gt)) - loss.backward() - optimizer.step() - -#im, gt = train[0] -#im = im.unsqueeze(0) - -#loss, scores, boxes = frcnn((im, gt)) -#from IPython import embed; embed() diff --git a/fast_rcnn/main_old.py b/fast_rcnn/main_old.py new file mode 100644 index 0000000000..0cb7795536 --- /dev/null +++ b/fast_rcnn/main_old.py @@ -0,0 +1,277 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import torch.utils.trainer as trainer +import torch.utils.data +import numpy as np + +from roi_pooling import roi_pooling +from voc import VOCDetection, TransformVOCDetectionAnnotation + +from tqdm import tqdm + + +cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') +class_to_ind = dict(zip(cls, range(len(cls)))) + + +train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + + +# TODO +# add class information in dataset +# separate in different files +# remove hard-coding 21 from Sampler +# cache the sampled boxes ? + +# image flip goes to the dataset class, not BoxSampler + +def bbox_overlaps(a, bb): + #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b + oo = [] + + for b in bb: + + x1 = a.select(1,0).clone() + x1[x1.lt(b[0])] = b[0] + y1 = a.select(1,1).clone() + y1[y1.lt(b[1])] = b[1] + x2 = a.select(1,2).clone() + x2[x2.gt(b[2])] = b[2] + y2 = a.select(1,3).clone() + y2[y2.gt(b[3])] = b[3] + + w = x2-x1+1 + h = y2-y1+1 + inter = torch.mul(w,h).float() + aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + + # intersection over union overlap + o = torch.div(inter , (aarea+barea-inter)) + # set invalid entries to 0 overlap + o[w.lt(0)] = 0 + o[h.lt(0)] = 0 + + oo += [o] + + return torch.cat([o.view(-1,1) for o in oo],1) + +class BoxGenerator(object): + def __init__(self, num_boxes=2000): + super(BoxGenerator, self).__init__() + self.num_boxes = num_boxes + + def __call__(self, im): + #h, w = im.size()[1:] + w, h = im.size + x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) + y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) + + x = x[0] + y = y[0] + + return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) + + +class BoxSampler(torch.utils.data.Dataset): + + def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), + generate_boxes=BoxGenerator(num_boxes=10000)): + super(BoxSampler, self).__init__() + self.dataset = dataset + self.fg_threshold = fg_threshold + self.bg_threshold = bg_threshold + self.generate_boxes = generate_boxes + + def _overlap_and_attribute(self, boxes, gt_roidb): + + #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) + overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) + + if gt_roidb is not None and gt_roidb['boxes'].size > 0: + gt_boxes = gt_roidb['boxes'] + gt_classes = np.array(gt_roidb['gt_classes']) + gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() + argmaxes = 
gt_overlaps.argmax(axis=1) + maxes = gt_overlaps.max(axis=1) + + # remove low scoring + pos = maxes >= self.fg_threshold + neg = (maxes >= self.bg_threshold[0]) & (maxes < self.bg_threshold[1]) + maxes[neg] = 0 + + I = np.where(maxes > 0)[0] + overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] + + overlaps = overlaps[pos | neg] + boxes = boxes.numpy() + boxes = boxes[pos | neg] + return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) + + def __getitem__(self, idx): + im, gt = self.dataset[idx] + boxes = self.generate_boxes(im) + boxes, labels = self._overlap_and_attribute(boxes, gt) + + if True: + w, h = im.size + im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) + im = im.view(h, w, 3) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + im = im.transpose(0, 1).transpose(0, 2).contiguous() + im = im.float().div_(255) + + return im, boxes, labels + + def __len__(self): + return len(self.dataset) + + +class BoxSelector(torch.utils.data.Dataset): + def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): + super(BoxSelector, self).__init__() + self.dataset = dataset + self.num_boxes = num_boxes + self.fg_fraction = fg_fraction + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, idx): + im, boxes, labels = self.dataset[idx] + + boxes = boxes.numpy() + labels = labels.numpy() + + bg = np.where(labels == 0)[0] + fg = np.where(labels != 0)[0] + nfg = min(len(fg), self.num_boxes*self.fg_fraction) + nbg = min(len(bg), self.num_boxes - nfg) + + bg = bg[np.random.permutation(len(bg))[:nbg]] + fg = fg[np.random.permutation(len(fg))[:nfg]] + + I = np.concatenate([fg, bg], axis=0) + + return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) + + +class ToPILImage(object): + """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W + or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C + to a PIL.Image of range [0, 255] + """ + def __call__(self, pic): + from PIL import Image, ImageOps + if isinstance(pic, np.ndarray): + # handle numpy array + img = Image.fromarray(pic) + else: + npimg = pic.mul(255).byte().numpy() + npimg = np.transpose(npimg, (1,2,0)) + img = Image.fromarray(npimg) + return img + +def make_grid(tensor, nrow=8, padding=2): + import math + """ + Given a 4D mini-batch Tensor of shape (B x C x H x W), + or a list of images all of the same size, + makes a grid of images + """ + tensorlist = None + if isinstance(tensor, list): + tensorlist = tensor + numImages = len(tensorlist) + size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) + tensor = tensorlist[0].new(size) + for i in range(numImages): + tensor[i].copy_(tensorlist[i]) + if tensor.dim() == 2: # single image H x W + tensor = tensor.view(1, tensor.size(0), tensor.size(1)) + if tensor.dim() == 3: # single image + if tensor.size(0) == 1: + tensor = torch.cat((tensor, tensor, tensor), 0) + return tensor + if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images + tensor = torch.cat((tensor, tensor, tensor), 1) + # make the mini-batch of images into a grid + nmaps = tensor.size(0) + xmaps = min(nrow, nmaps) + ymaps = int(math.ceil(nmaps / xmaps)) + height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) + grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) + k = 0 + for y in range(ymaps): + for x in range(xmaps): + if k >= nmaps: + break + grid.narrow(1, y*height+1+padding//2,height-padding)\ + .narrow(2, x*width+1+padding//2, 
width-padding)\ + .copy_(tensor[k]) + k = k + 1 + return grid + + + +ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) + +def collate_fn(batch): + imgs, boxes, labels = zip(*batch) + max_size = [max(size) for size in zip(*[im.size() for im in imgs])] + new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) + for im, im2 in zip(new_imgs, imgs): + im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) + boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) + boxes = torch.from_numpy(boxes) + labels = torch.cat(labels, 0) + return new_imgs, boxes, labels + +train_loader = torch.utils.data.DataLoader( + ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) + + +def show(img, boxes, label, cls=None): + from PIL import Image, ImageDraw + #img, target = self.__getitem__(index) + if cls is None: + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + draw = ImageDraw.Draw(img) + for obj, t in zip(boxes, label): + if t > 0: + draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) + draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) + else: + #pass + draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + img.show() + + +for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): + #grid = make_grid(img, 2, 1) + #grid = ToPILImage()(grid) + #grid.show() + #break + pass + #print('====') + #print(i) + #print(img.size()) + #print(boxes.size()) + #print(labels.size()) + +#im, box, label = ds[10] +#show(im,box,label) From 4058094721e3374b0c94ebac8f22d4fcc90f089b Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 21:14:14 -0200 Subject: [PATCH 12/18] rename --- fast_rcnn/{model.py => model_old.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename fast_rcnn/{model.py => model_old.py} (100%) diff --git a/fast_rcnn/model.py b/fast_rcnn/model_old.py similarity index 100% rename from fast_rcnn/model.py rename to fast_rcnn/model_old.py From a0061e8acfc086fbd87221c47ab1aa72caf57668 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 21:14:45 -0200 Subject: [PATCH 13/18] Cleaning up a bit --- fast_rcnn/README.md | 5 ++ fast_rcnn/faster_rcnn.py | 3 +- fast_rcnn/main.py | 161 +++++++++++++++++++++++---------------- fast_rcnn/model.py | 50 ++++++++++++ 4 files changed, 153 insertions(+), 66 deletions(-) create mode 100644 fast_rcnn/README.md create mode 100644 fast_rcnn/model.py diff --git a/fast_rcnn/README.md b/fast_rcnn/README.md new file mode 100644 index 0000000000..4389746ab3 --- /dev/null +++ b/fast_rcnn/README.md @@ -0,0 +1,5 @@ +# Faster R-CNN code example + +```python +python main.py PATH_TO_DATASET +``` diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index f0e07455a2..86379f775a 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -21,8 +21,8 @@ def __init__(self, super(FasterRCNN, self).__init__() self.features = features self.roi_pooling = pooler - self.classifier = classifier self.rpn = rpn + self.classifier = classifier self.batch_size = batch_size self.fg_fraction = fg_fraction @@ -32,7 +32,6 @@ def __init__(self, self.bg_threshold = bg_threshold self._num_classes = num_classes - # should it support batched images ? 
def forward(self, x): if self.training is True: diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 604c07e263..2e993c1e85 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -1,28 +1,48 @@ +import argparse +import time + import torch import torch.nn as nn -import torch.autograd as ag -import torch.utils.trainer as trainer import torch.utils.data -import numpy as np import torchvision.transforms as transforms -from rpn import RPN + import torch.optim as optim -from roi_pooling import roi_pooling from voc import VOCDetection, TransformVOCDetectionAnnotation -from faster_rcnn import FasterRCNN + +import importlib + +#from model import model + from tqdm import tqdm +parser = argparse.ArgumentParser(description='PyTorch Faster R-CNN Training') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--model', '-m', metavar='MODEL', default='model', + help='file containing model definition ' + '(default: model)') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate') +parser.add_argument('--momentum', default=0.01, type=float, metavar='M', + help='momentum') +parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)') +parser.add_argument('--print-freq', '-p', default=10, type=int, + metavar='N', help='print frequency (default: 10)') + cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') class_to_ind = dict(zip(cls, range(len(cls)))) +args = parser.parse_args() +model = importlib.import_module(args.model).model -train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', +train = VOCDetection(args.data, 'train', transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) @@ -31,58 +51,71 @@ def collate_fn(batch): return imgs[0].unsqueeze(0), gt[0] train_loader = torch.utils.data.DataLoader( - train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) - -class Features(nn.Container): - def __init__(self): - super(Features, self).__init__() - self.m = nn.Conv2d(3, 3, 3, 16, 1) - - def forward(self, x): - return self.m(x) - -class Classifier(nn.Container): - def __init__(self): - super(Classifier, self).__init__() - self.m1 = nn.Linear(3*7*7, 21) - self.m2 = nn.Linear(3*7*7, 21*4) - - def forward(self, x): - return self.m1(x), self.m2(x) - -def pooler(x, rois): - from roi_pooling import roi_pooling - x = roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) - return x.view(x.size(0), -1) - -class RPNClassifier(nn.Container): - def __init__(self, n): - super(RPNClassifier, self).__init__() - self.m1 = nn.Conv2d(n, 18, 3, 1, 1) - self.m2 = nn.Conv2d(n, 36, 3, 1, 1) - - def forward(self, x): - return self.m1(x), self.m2(x) - -rpn = RPN(RPNClassifier(3)) - -frcnn = FasterRCNN(Features(), pooler, Classifier(), rpn) - -frcnn.train() - -optimizer = optim.SGD(frcnn.parameters(), lr = 0.01, momentum=0.9) - + train, batch_size=1, shuffle=True, + num_workers=0, collate_fn=collate_fn) + + + +optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.momentum, + 
weight_decay=args.weight_decay) + +def train(train_loader, model, optimizer, epoch): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + + model.train() + end = time.time() + for i, (im, gt) in (enumerate(train_loader)): + # measure data loading time + data_time.update(time.time() - end) + + optimizer.zero_grad() + loss, scores, boxes = model((im, gt)) + loss.backward() + optimizer.step() + + losses.update(loss.data[0], im.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + if i % args.print_freq == 0: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + #'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + #'Prec@5 {top5.val:.3f} ({top5.avg:.3f})' + .format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, + #top1=top1, top5=top5 + )) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + +for epoch in range(0, 10): + train(train_loader, model, optimizer, epoch) #from IPython import embed; embed() -for i, (im, gt) in tqdm(enumerate(train_loader)): - optimizer.zero_grad() - loss, scores, boxes = frcnn((im, gt)) - loss.backward() - optimizer.step() - -#im, gt = train[0] -#im = im.unsqueeze(0) - -#loss, scores, boxes = frcnn((im, gt)) -#from IPython import embed; embed() +#if __name__ == '__main__': +# main() diff --git a/fast_rcnn/model.py b/fast_rcnn/model.py new file mode 100644 index 0000000000..ba2e1c9acb --- /dev/null +++ b/fast_rcnn/model.py @@ -0,0 +1,50 @@ +import torch.nn as nn +from roi_pooling import roi_pooling as _roi_pooling + +from rpn import RPN as _RPN +from faster_rcnn import FasterRCNN as _FasterRCNN + +class _Features(nn.Container): + def __init__(self): + super(_Features, self).__init__() + self.m = nn.Conv2d(3, 3, 3, 16, 1) + + def forward(self, x): + return self.m(x) + +class _Classifier(nn.Container): + def __init__(self): + super(_Classifier, self).__init__() + self.m1 = nn.Linear(3*7*7, 21) + self.m2 = nn.Linear(3*7*7, 21*4) + + def forward(self, x): + return self.m1(x), self.m2(x) + +def _pooler(x, rois): + x = _roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) + +class _RPNClassifier(nn.Container): + def __init__(self, n): + super(_RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + +_features = _Features() +_classifier = _Classifier() +_rpn_classifier = _RPNClassifier(3) + +_rpn = _RPN( + classifier=_rpn_classifier +) + +model = _FasterRCNN( + features=_features, + pooler=_pooler, + classifier=_classifier, + rpn=_rpn +) From cfb643fe612ce66c091ae56dfb965cb2c8c94221 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 24 Dec 2016 21:47:55 -0200 Subject: [PATCH 14/18] Reduce default learning rate --- fast_rcnn/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index 2e993c1e85..a7b1a280d8 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -22,9 +22,9 @@ parser.add_argument('--model', '-m', metavar='MODEL', 
default='model', help='file containing model definition ' '(default: model)') -parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, +parser.add_argument('--lr', '--learning-rate', default=0.01, type=float, metavar='LR', help='initial learning rate') -parser.add_argument('--momentum', default=0.01, type=float, metavar='M', +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, metavar='W', help='weight decay (default: 1e-4)') From e36a93606f7eb241c406b06292256921fa8b5be8 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 25 Dec 2016 13:01:47 -0200 Subject: [PATCH 15/18] Fixes --- fast_rcnn/README.md | 8 ++++++ fast_rcnn/faster_rcnn.py | 12 ++++++--- fast_rcnn/main.py | 57 +++++++++++++++++++++++++++++++++++++--- fast_rcnn/model.py | 17 ++++++------ fast_rcnn/rpn.py | 6 +++-- fast_rcnn/voc.py | 19 +++++--------- 6 files changed, 91 insertions(+), 28 deletions(-) diff --git a/fast_rcnn/README.md b/fast_rcnn/README.md index 4389746ab3..2d45a6711a 100644 --- a/fast_rcnn/README.md +++ b/fast_rcnn/README.md @@ -3,3 +3,11 @@ ```python python main.py PATH_TO_DATASET ``` + +## Things to add/change/consider +* where to handle the image scaling. Need to scale the annotations, and also RPN filters the minimum size wrt the original image size, and not the scaled image +* properly supporting flipping +* best way to handle different parameters in RPN/FRCNN for train/eval modes +* uniformize Variables, they should be provided by the user and not processed by me +* should image scaling be handled in FasterRCNN class? +* general code cleanup, lots of torch/numpy mixture diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 86379f775a..60d24158b1 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -34,16 +34,21 @@ def __init__(self, # should it support batched images ? 
def forward(self, x): - if self.training is True: + #if self.training is True: + if isinstance(x, tuple): im, gt = x else: im = x + gt = None + + assert im.size(0) == 1, 'only single element batches supported' feats = self.features(_tovar(im)) roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) - if self.training is True: + #if self.training is True: + if gt is not None: # append gt boxes and sample fg / bg boxes # proposal_target-layer.py all_rois, frcnn_labels, roi_boxes, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) @@ -55,7 +60,8 @@ def forward(self, x): boxes = self.bbox_reg(roi_boxes, bbox_pred, im) # apply cls + bbox reg loss here - if self.training is True: + #if self.training is True: + if gt is not None: frcnn_loss = self.frcnn_loss(scores, bbox_pred, frcnn_labels, frcnn_bbox_targets) loss = frcnn_loss + rpn_loss return loss, scores, boxes diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py index a7b1a280d8..c1301e74aa 100644 --- a/fast_rcnn/main.py +++ b/fast_rcnn/main.py @@ -1,5 +1,6 @@ import argparse import time +#from copy import deepcopy import torch import torch.nn as nn @@ -40,9 +41,15 @@ class_to_ind = dict(zip(cls, range(len(cls)))) args = parser.parse_args() -model = importlib.import_module(args.model).model +model = importlib.import_module(args.model).model() +model_test = importlib.import_module(args.model).model() +model_test.load_state_dict(model.state_dict()) -train = VOCDetection(args.data, 'train', +train_data = VOCDetection(args.data, 'train', + transform=transforms.ToTensor(), + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +val_data = VOCDetection(args.data, 'val', transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) @@ -51,10 +58,13 @@ def collate_fn(batch): return imgs[0].unsqueeze(0), gt[0] train_loader = torch.utils.data.DataLoader( - train, batch_size=1, shuffle=True, + train_data, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn) +val_loader = torch.utils.data.DataLoader( + val_data, batch_size=1, shuffle=False, + num_workers=0, collate_fn=collate_fn) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, @@ -68,6 +78,8 @@ def train(train_loader, model, optimizer, epoch): model.train() end = time.time() for i, (im, gt) in (enumerate(train_loader)): + adjust_learning_rate(optimizer, epoch) + # measure data loading time data_time.update(time.time() - end) @@ -93,7 +105,45 @@ def train(train_loader, model, optimizer, epoch): data_time=data_time, loss=losses, #top1=top1, top5=top5 )) + #global model_test + #assert model.state_dict() == model_test.state_dict() + +def validate(val_loader, model): + batch_time = AverageMeter() + losses = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + + for i, (im, gt) in enumerate(val_loader): + loss, scores, boxes = model((im, gt)) + losses.update(loss.data[0], im.size(0)) + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + #'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + #'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + #'Prec@5 {top5.val:.3f} ({top5.avg:.3f})' + .format( + i, len(val_loader), batch_time=batch_time, + #data_time=data_time, + loss=losses, + #top1=top1, top5=top5 + )) + +def adjust_learning_rate(optimizer, epoch): + """Sets the learning rate to the 
initial LR decayed by 10 every 30 epochs""" + lr = args.lr * (0.1 ** (epoch // 30)) + for param_group in optimizer.state_dict()['param_groups']: + param_group['lr'] = lr class AverageMeter(object): """Computes and stores the average and current value""" @@ -114,6 +164,7 @@ def update(self, val, n=1): for epoch in range(0, 10): train(train_loader, model, optimizer, epoch) + #validate(val_loader, model) #from IPython import embed; embed() diff --git a/fast_rcnn/model.py b/fast_rcnn/model.py index ba2e1c9acb..88cd1f95e0 100644 --- a/fast_rcnn/model.py +++ b/fast_rcnn/model.py @@ -34,17 +34,18 @@ def __init__(self, n): def forward(self, x): return self.m1(x), self.m2(x) -_features = _Features() -_classifier = _Classifier() -_rpn_classifier = _RPNClassifier(3) +def model(): + _features = _Features() + _classifier = _Classifier() + _rpn_classifier = _RPNClassifier(3) -_rpn = _RPN( + _rpn = _RPN( classifier=_rpn_classifier -) - -model = _FasterRCNN( + ) + _model = _FasterRCNN( features=_features, pooler=_pooler, classifier=_classifier, rpn=_rpn -) + ) + return _model diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index bbbe2f6096..c3608871ab 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -44,6 +44,7 @@ def __init__(self, # output rpn probs as well def forward(self, im, feats, gt=None): + assert im.size(0) == 1, 'only single element batches supported' # improve # it is used in get_anchors and also present in roi_pooling self._feat_stride = round(im.size(3)/feats.size(3)) @@ -52,7 +53,8 @@ def forward(self, im, feats, gt=None): rpn_map, rpn_bbox_pred = self.rpn_classifier(feats) all_anchors = self.rpn_get_anchors(feats) rpn_loss = None - if self.training is True: + #if self.training is True: + if gt is not None: assert gt is not None rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) # need to subsample boxes here @@ -106,7 +108,7 @@ def rpn_targets(self, all_anchors, im, gt): # keep only inside anchors anchors = all_anchors[inds_inside, :] - assert anchors.shape[0] > 0 + assert anchors.shape[0] > 0, '{0}x{1} -> {2}'.format(height,width,total_anchors) # label: 1 is positive, 0 is negative, -1 is dont care labels = np.empty((len(inds_inside), ), dtype=np.float32) diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index 164b6b9dd4..bb2331db4f 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -16,32 +16,27 @@ def __init__(self, class_to_ind, keep_difficult=False): self.class_to_ind = class_to_ind def __call__(self, target): - #res = [] - #res = {} boxes = [] gt_classes = [] for obj in target.iter('object'): difficult = int(obj.find('difficult').text) == 1 if not self.keep_difficult and difficult: continue - #name = obj.find('name').text - name = obj[0].text.lower().strip() + name = obj.find('name').text.lower().strip() bb = obj.find('bndbox') - #bbox = obj[4] bndbox = map(int, [bb.find('xmin').text, bb.find('ymin').text, bb.find('xmax').text, bb.find('ymax').text]) - # supposes the order is xmin, ymin, xmax, ymax - # attention with indices - #bndbox = [int(bb.text)-1 for bb in bbox] - #res += [bndbox + [name]] - #res += [bndbox + [class_to_ind[name]]] boxes += [torch.LongTensor(bndbox)] gt_classes += [self.class_to_ind[name]] - + + size = target.find('size') + im_info = map(int,(size.find('height').text, size.find('width').text, 1)) + res = { 'boxes': torch.cat([b.view(1,-1) for b in boxes], 0), - 'gt_classes':gt_classes + 'gt_classes':gt_classes, + 'im_info': im_info } return res From e73ee5331c7fb186a00ffb7a779c4a06c9b34fc7 Mon Sep 17 00:00:00 2001 From: Francisco 
Massa Date: Sun, 25 Dec 2016 13:05:13 -0200 Subject: [PATCH 16/18] Removing unnecessary files from tree --- fast_rcnn/main_old.py | 277 ----------------------------------------- fast_rcnn/model_old.py | 29 ----- 2 files changed, 306 deletions(-) delete mode 100644 fast_rcnn/main_old.py delete mode 100644 fast_rcnn/model_old.py diff --git a/fast_rcnn/main_old.py b/fast_rcnn/main_old.py deleted file mode 100644 index 0cb7795536..0000000000 --- a/fast_rcnn/main_old.py +++ /dev/null @@ -1,277 +0,0 @@ -import torch -import torch.nn as nn -import torch.autograd as ag -import torch.utils.trainer as trainer -import torch.utils.data -import numpy as np - -from roi_pooling import roi_pooling -from voc import VOCDetection, TransformVOCDetectionAnnotation - -from tqdm import tqdm - - -cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') -class_to_ind = dict(zip(cls, range(len(cls)))) - - -train = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', - target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) - - -# TODO -# add class information in dataset -# separate in different files -# remove hard-coding 21 from Sampler -# cache the sampled boxes ? - -# image flip goes to the dataset class, not BoxSampler - -def bbox_overlaps(a, bb): - #b = b.xmin and {b.xmin,b.ymin,b.xmax,b.ymax} or b - oo = [] - - for b in bb: - - x1 = a.select(1,0).clone() - x1[x1.lt(b[0])] = b[0] - y1 = a.select(1,1).clone() - y1[y1.lt(b[1])] = b[1] - x2 = a.select(1,2).clone() - x2[x2.gt(b[2])] = b[2] - y2 = a.select(1,3).clone() - y2[y2.gt(b[3])] = b[3] - - w = x2-x1+1 - h = y2-y1+1 - inter = torch.mul(w,h).float() - aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() - barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) - - # intersection over union overlap - o = torch.div(inter , (aarea+barea-inter)) - # set invalid entries to 0 overlap - o[w.lt(0)] = 0 - o[h.lt(0)] = 0 - - oo += [o] - - return torch.cat([o.view(-1,1) for o in oo],1) - -class BoxGenerator(object): - def __init__(self, num_boxes=2000): - super(BoxGenerator, self).__init__() - self.num_boxes = num_boxes - - def __call__(self, im): - #h, w = im.size()[1:] - w, h = im.size - x = torch.LongTensor(self.num_boxes, 2).random_(0,w-1).sort(1) - y = torch.LongTensor(self.num_boxes, 2).random_(0,h-1).sort(1) - - x = x[0] - y = y[0] - - return torch.cat([x.select(1,0), y.select(1,0), x.select(1,1), y.select(1,1)], 1) - - -class BoxSampler(torch.utils.data.Dataset): - - def __init__(self, dataset, fg_threshold=0.5, bg_threshold=(0.0,0.5), - generate_boxes=BoxGenerator(num_boxes=10000)): - super(BoxSampler, self).__init__() - self.dataset = dataset - self.fg_threshold = fg_threshold - self.bg_threshold = bg_threshold - self.generate_boxes = generate_boxes - - def _overlap_and_attribute(self, boxes, gt_roidb): - - #overlaps = np.zeros((boxes.size(0), self.num_classes), dtype=np.float32) - overlaps = np.zeros((boxes.size(0), 21), dtype=np.float32) - - if gt_roidb is not None and gt_roidb['boxes'].size > 0: - gt_boxes = gt_roidb['boxes'] - gt_classes = np.array(gt_roidb['gt_classes']) - gt_overlaps = bbox_overlaps(boxes,gt_boxes).numpy() - argmaxes = gt_overlaps.argmax(axis=1) - maxes = gt_overlaps.max(axis=1) - - # remove low scoring - pos = maxes >= self.fg_threshold - neg = (maxes >= self.bg_threshold[0]) & (maxes < 
self.bg_threshold[1]) - maxes[neg] = 0 - - I = np.where(maxes > 0)[0] - overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] - - overlaps = overlaps[pos | neg] - boxes = boxes.numpy() - boxes = boxes[pos | neg] - return torch.from_numpy(boxes), torch.from_numpy(overlaps.argmax(axis=1)) - - def __getitem__(self, idx): - im, gt = self.dataset[idx] - boxes = self.generate_boxes(im) - boxes, labels = self._overlap_and_attribute(boxes, gt) - - if True: - w, h = im.size - im = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) - im = im.view(h, w, 3) - # put it from HWC to CHW format - # yikes, this transpose takes 80% of the loading time/CPU - im = im.transpose(0, 1).transpose(0, 2).contiguous() - im = im.float().div_(255) - - return im, boxes, labels - - def __len__(self): - return len(self.dataset) - - -class BoxSelector(torch.utils.data.Dataset): - def __init__(self, dataset, num_boxes=128, fg_fraction=0.25): - super(BoxSelector, self).__init__() - self.dataset = dataset - self.num_boxes = num_boxes - self.fg_fraction = fg_fraction - - def __len__(self): - return len(self.dataset) - - def __getitem__(self, idx): - im, boxes, labels = self.dataset[idx] - - boxes = boxes.numpy() - labels = labels.numpy() - - bg = np.where(labels == 0)[0] - fg = np.where(labels != 0)[0] - nfg = min(len(fg), self.num_boxes*self.fg_fraction) - nbg = min(len(bg), self.num_boxes - nfg) - - bg = bg[np.random.permutation(len(bg))[:nbg]] - fg = fg[np.random.permutation(len(fg))[:nfg]] - - I = np.concatenate([fg, bg], axis=0) - - return im, torch.from_numpy(boxes[I]), torch.from_numpy(labels[I]) - - -class ToPILImage(object): - """ Converts a torch.*Tensor of range [0, 1] and shape C x H x W - or numpy ndarray of dtype=uint8, range[0, 255] and shape H x W x C - to a PIL.Image of range [0, 255] - """ - def __call__(self, pic): - from PIL import Image, ImageOps - if isinstance(pic, np.ndarray): - # handle numpy array - img = Image.fromarray(pic) - else: - npimg = pic.mul(255).byte().numpy() - npimg = np.transpose(npimg, (1,2,0)) - img = Image.fromarray(npimg) - return img - -def make_grid(tensor, nrow=8, padding=2): - import math - """ - Given a 4D mini-batch Tensor of shape (B x C x H x W), - or a list of images all of the same size, - makes a grid of images - """ - tensorlist = None - if isinstance(tensor, list): - tensorlist = tensor - numImages = len(tensorlist) - size = torch.Size(torch.Size([long(numImages)]) + tensorlist[0].size()) - tensor = tensorlist[0].new(size) - for i in range(numImages): - tensor[i].copy_(tensorlist[i]) - if tensor.dim() == 2: # single image H x W - tensor = tensor.view(1, tensor.size(0), tensor.size(1)) - if tensor.dim() == 3: # single image - if tensor.size(0) == 1: - tensor = torch.cat((tensor, tensor, tensor), 0) - return tensor - if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images - tensor = torch.cat((tensor, tensor, tensor), 1) - # make the mini-batch of images into a grid - nmaps = tensor.size(0) - xmaps = min(nrow, nmaps) - ymaps = int(math.ceil(nmaps / xmaps)) - height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) - grid = tensor.new(3, height * ymaps, width * xmaps).fill_(tensor.max()) - k = 0 - for y in range(ymaps): - for x in range(xmaps): - if k >= nmaps: - break - grid.narrow(1, y*height+1+padding//2,height-padding)\ - .narrow(2, x*width+1+padding//2, width-padding)\ - .copy_(tensor[k]) - k = k + 1 - return grid - - - -ds = BoxSelector(BoxSampler(train, fg_threshold=0.75), 64, 0.25) - -def collate_fn(batch): - imgs, boxes, 
labels = zip(*batch) - max_size = [max(size) for size in zip(*[im.size() for im in imgs])] - new_imgs = imgs[0].new(len(imgs), *max_size).fill_(0) - for im, im2 in zip(new_imgs, imgs): - im.narrow(1,0,im2.size(1)).narrow(2,0,im2.size(2)).copy_(im2) - boxes = np.concatenate([np.column_stack((np.full(t.size(0), i, dtype=np.int64), t.numpy())) for i, t in enumerate(boxes, 0)], axis=0) - boxes = torch.from_numpy(boxes) - labels = torch.cat(labels, 0) - return new_imgs, boxes, labels - -train_loader = torch.utils.data.DataLoader( - ds, batch_size=2, shuffle=True, num_workers=2, collate_fn=collate_fn) - - -def show(img, boxes, label, cls=None): - from PIL import Image, ImageDraw - #img, target = self.__getitem__(index) - if cls is None: - cls = ('__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') - - draw = ImageDraw.Draw(img) - for obj, t in zip(boxes, label): - if t > 0: - draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) - draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) - else: - #pass - draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) - img.show() - - -for i, (img, boxes, labels) in tqdm(enumerate(train_loader)): - #grid = make_grid(img, 2, 1) - #grid = ToPILImage()(grid) - #grid.show() - #break - pass - #print('====') - #print(i) - #print(img.size()) - #print(boxes.size()) - #print(labels.size()) - -#im, box, label = ds[10] -#show(im,box,label) diff --git a/fast_rcnn/model_old.py b/fast_rcnn/model_old.py deleted file mode 100644 index 44d7dddf9b..0000000000 --- a/fast_rcnn/model_old.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch.nn as nn -from roi_pooling import roi_pooling - -class Network(nn.Container): - - def __init__(self, features, classifier): - super(Network, self).__init__() - self.features = features - self.classifier = classifier - - def forward(self, x): - images, rois = x - x = self.features(images) - x = roi_pooling(x, rois, size=(3,3), spatial_scale=1.0/16.0) - x = self.classifier(x) - return x - -def basic_net(): - features = nn.Sequential(nn.Conv2d(3,16,3,16,1,1)) - classifier = nn.Sequential(nn.Linear(3*3*16,10)) - return Network(features, classifier) - -if __name__ == '__main__': - import torch - import torch.autograd - m = basic_net() - x = torch.autograd.Variable(torch.rand(1,3,224,224)) - b = torch.autograd.Variable(torch.LongTensor([[0,1,50,200,200],[0,50,50,200,200]])) - o = m((x,b)) From 79c2402c4182c940850ed2649e8b1ae23a066460 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 25 Dec 2016 13:08:08 -0200 Subject: [PATCH 17/18] Rename --- fast_rcnn/faster_rcnn.py | 4 ++-- fast_rcnn/rpn.py | 4 ++-- fast_rcnn/{bbox_transform.py => utils.py} | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename fast_rcnn/{bbox_transform.py => utils.py} (100%) diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py index 60d24158b1..fbd9434e7d 100644 --- a/fast_rcnn/faster_rcnn.py +++ b/fast_rcnn/faster_rcnn.py @@ -4,10 +4,10 @@ import numpy as np import numpy.random as npr -from bbox_transform import \ +from utils import \ bbox_transform, bbox_transform_inv, clip_boxes, bbox_overlaps -from bbox_transform import to_var as _tovar +from utils import to_var as _tovar # should handle multiple scales, how? 
class FasterRCNN(nn.Container): diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index c3608871ab..6faff8eaa6 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -5,10 +5,10 @@ import numpy.random as npr # clean up environment -from bbox_transform import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps +from utils import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps from generate_anchors import generate_anchors -from bbox_transform import to_var as _tovar +from utils import to_var as _tovar from py_cpu_nms import py_cpu_nms as nms diff --git a/fast_rcnn/bbox_transform.py b/fast_rcnn/utils.py similarity index 100% rename from fast_rcnn/bbox_transform.py rename to fast_rcnn/utils.py From d8d378c31d2766009db400ac03f41dd837a56c2a Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 25 Dec 2016 16:28:46 -0200 Subject: [PATCH 18/18] minor changes --- fast_rcnn/README.md | 5 +++-- fast_rcnn/rpn.py | 15 ++++++++++----- fast_rcnn/voc.py | 11 +++++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/fast_rcnn/README.md b/fast_rcnn/README.md index 2d45a6711a..fb1aab88ba 100644 --- a/fast_rcnn/README.md +++ b/fast_rcnn/README.md @@ -6,8 +6,9 @@ python main.py PATH_TO_DATASET ## Things to add/change/consider * where to handle the image scaling. Need to scale the annotations, and also RPN filters the minimum size wrt the original image size, and not the scaled image +* should image scaling be handled in FasterRCNN class? * properly supporting flipping * best way to handle different parameters in RPN/FRCNN for train/eval modes -* uniformize Variables, they should be provided by the user and not processed by me -* should image scaling be handled in FasterRCNN class? +* uniformize Variables, they should be provided by the user and not processed by FasterRCNN/RPN classes * general code cleanup, lots of torch/numpy mixture +* should I use a general config file? 
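
[Editorial aside, not part of the patches: the first README item above, handling image scaling so the annotations stay consistent with the resized image, could be sketched as a dataset-side joint transform along the following lines. `ScaleImageAndBoxes`, the joint `(img, target)` call signature, and the 600-pixel shorter-side target are illustrative assumptions, not names or defaults used in this repository.]

```python
# Illustrative sketch only: scale the image and its ground-truth boxes together,
# so downstream code (e.g. the RPN minimum-size filter) sees one coordinate frame.
from PIL import Image


class ScaleImageAndBoxes(object):
    """Resize a PIL image so its shorter side equals `target_size`
    and scale the ground-truth boxes by the same factor (assumed helper)."""

    def __init__(self, target_size=600):
        self.target_size = target_size

    def __call__(self, img, target):
        w, h = img.size
        scale = float(self.target_size) / min(w, h)
        new_w, new_h = int(round(w * scale)), int(round(h * scale))
        img = img.resize((new_w, new_h), Image.BILINEAR)

        target = dict(target)
        # boxes come out of the annotation transform as a LongTensor of x1,y1,x2,y2
        target['boxes'] = (target['boxes'].float() * scale).round().long()
        # keep im_info in the resized frame: (height, width, scale)
        target['im_info'] = [new_h, new_w, scale]
        return img, target
```

[Keeping `boxes` and `im_info` in the resized image's frame would let the RPN filter minimum box sizes against the image it actually receives, which is the concern raised in the first bullet above.]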
diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py index 6faff8eaa6..b7421a2fc1 100644 --- a/fast_rcnn/rpn.py +++ b/fast_rcnn/rpn.py @@ -66,10 +66,13 @@ def forward(self, im, feats, gt=None): # proposal_layer.py roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_pred, im) # only for visualization - #roi_boxes = all_anchors + if False: + roi_boxes = all_anchors + return _tovar((roi_boxes, scores, rpn_loss, rpn_labels)) return _tovar((roi_boxes, scores, rpn_loss)) + # from faster rcnn py def rpn_get_anchors(self, im): height, width = im.size()[-2:] @@ -248,7 +251,9 @@ def _unmap(data, count, inds, fill=0): def show(img, boxes, label): from PIL import Image, ImageDraw + import torchvision.transforms as transforms #img, target = self.__getitem__(index) + img = transforms.ToPILImage()(img) draw = ImageDraw.Draw(img) for obj, t in zip(boxes, label): #print(type(t)) @@ -258,8 +263,8 @@ def show(img, boxes, label): #draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) #else: elif t == 0: - pass - #draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + #pass + draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) img.show() @@ -293,7 +298,7 @@ def forward(self, x): transform=transforms.ToTensor(), target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) - im, gt = train[11] + im, gt = train[100] im0 = im im = im.unsqueeze(0) @@ -312,6 +317,6 @@ def forward(self, x): print loss loss.backward() - #show(im0, boxes.data, labels.data.int().tolist()) + show(im0, boxes.data, labels.data.int().tolist()) #from IPython import embed; embed() diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py index bb2331db4f..1eb0b0e0ed 100644 --- a/fast_rcnn/voc.py +++ b/fast_rcnn/voc.py @@ -9,6 +9,13 @@ else: import xml.etree.ElementTree as ET +def _flip_box(boxes, width): + boxes = boxes.clone() + oldx1 = boxes[:, 0].clone() + oldx2 = boxes[:, 2].clone() + boxes[:, 0] = width - oldx2 - 1 + boxes[:, 2] = width - oldx1 - 1 + return boxes class TransformVOCDetectionAnnotation(object): def __init__(self, class_to_ind, keep_difficult=False): @@ -27,14 +34,14 @@ def __call__(self, target): bndbox = map(int, [bb.find('xmin').text, bb.find('ymin').text, bb.find('xmax').text, bb.find('ymax').text]) - boxes += [torch.LongTensor(bndbox)] + boxes += [bndbox] gt_classes += [self.class_to_ind[name]] size = target.find('size') im_info = map(int,(size.find('height').text, size.find('width').text, 1)) res = { - 'boxes': torch.cat([b.view(1,-1) for b in boxes], 0), + 'boxes': torch.LongTensor(boxes), 'gt_classes':gt_classes, 'im_info': im_info }
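
[Editorial aside, not part of the patches: the final patch adds a `_flip_box` helper to voc.py but does not yet wire it into the dataset, and "properly supporting flipping" remains an open README item. A minimal sketch of how the helper might be used for a dataset-side horizontal flip follows; `RandomHorizontalFlipWithBoxes` and the joint `(img, target)` call signature are assumptions for illustration, not code from these patches.]

```python
# Illustrative sketch only: flip the image and mirror its boxes in one place,
# on the dataset side, so pixels and annotations stay consistent.
import random

from PIL import Image


def _flip_box(boxes, width):
    # same logic as the helper added to voc.py in the last patch
    boxes = boxes.clone()
    oldx1 = boxes[:, 0].clone()
    oldx2 = boxes[:, 2].clone()
    boxes[:, 0] = width - oldx2 - 1
    boxes[:, 2] = width - oldx1 - 1
    return boxes


class RandomHorizontalFlipWithBoxes(object):
    """Flip a PIL image left-right with probability 0.5 and mirror its boxes
    (assumed joint transform, applied before ToTensor)."""

    def __call__(self, img, target):
        if random.random() < 0.5:
            w, _ = img.size
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            target = dict(target)
            target['boxes'] = _flip_box(target['boxes'], w)
        return img, target
```

[Doing the flip at the dataset level, as the old TODO in main_old.py already suggests, keeps the mirrored boxes and pixels aligned before they ever reach the RPN or the box sampling code.]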