
Commit c398dad

Merge branch 'main' into proto-bbox-affine
2 parents: d08d335 + 3aa2a93

File tree: 7 files changed, +162 -48 lines

references/optical_flow/train.py

Lines changed: 79 additions & 43 deletions
@@ -60,16 +60,21 @@ def get_train_dataset(stage, dataset_root):
 
 
 @torch.no_grad()
-def _validate(model, args, val_dataset, *, padder_mode, num_flow_updates=None, batch_size=None, header=None):
+def _evaluate(model, args, val_dataset, *, padder_mode, num_flow_updates=None, batch_size=None, header=None):
     """Helper function to compute various metrics (epe, etc.) for a model on a given dataset.

     We process as many samples as possible with ddp, and process the rest on a single worker.
     """
     batch_size = batch_size or args.batch_size
+    device = torch.device(args.device)

     model.eval()

-    sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True)
+    if args.distributed:
+        sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True)
+    else:
+        sampler = torch.utils.data.SequentialSampler(val_dataset)
+
     val_loader = torch.utils.data.DataLoader(
         val_dataset,
         sampler=sampler,
@@ -88,7 +93,7 @@ def inner_loop(blob):
         image1, image2, flow_gt = blob[:3]
         valid_flow_mask = None if len(blob) == 3 else blob[-1]

-        image1, image2 = image1.cuda(), image2.cuda()
+        image1, image2 = image1.to(device), image2.to(device)

         padder = utils.InputPadder(image1.shape, mode=padder_mode)
         image1, image2 = padder.pad(image1, image2)
@@ -115,21 +120,22 @@ def inner_loop(blob):
         inner_loop(blob)
         num_processed_samples += blob[0].shape[0]  # batch size

-    num_processed_samples = utils.reduce_across_processes(num_processed_samples)
-    print(
-        f"Batch-processed {num_processed_samples} / {len(val_dataset)} samples. "
-        "Going to process the remaining samples individually, if any."
-    )
+    if args.distributed:
+        num_processed_samples = utils.reduce_across_processes(num_processed_samples)
+        print(
+            f"Batch-processed {num_processed_samples} / {len(val_dataset)} samples. "
+            "Going to process the remaining samples individually, if any."
+        )
+        if args.rank == 0:  # we only need to process the rest on a single worker
+            for i in range(num_processed_samples, len(val_dataset)):
+                inner_loop(val_dataset[i])

-    if args.rank == 0:  # we only need to process the rest on a single worker
-        for i in range(num_processed_samples, len(val_dataset)):
-            inner_loop(val_dataset[i])
+        logger.synchronize_between_processes()

-    logger.synchronize_between_processes()
     print(header, logger)


-def validate(model, args):
+def evaluate(model, args):
     val_datasets = args.val_dataset or []

     if args.prototype:
@@ -145,21 +151,21 @@ def validate(model, args):
         if name == "kitti":
             # Kitti has different image sizes so we need to individually pad them, we can't batch.
             # see comment in InputPadder
-            if args.batch_size != 1 and args.rank == 0:
+            if args.batch_size != 1 and (not args.distributed or args.rank == 0):
                 warnings.warn(
                     f"Batch-size={args.batch_size} was passed. For technical reasons, evaluating on Kitti can only be done with a batch-size of 1."
                 )

             val_dataset = KittiFlow(root=args.dataset_root, split="train", transforms=preprocessing)
-            _validate(
+            _evaluate(
                 model, args, val_dataset, num_flow_updates=24, padder_mode="kitti", header="Kitti val", batch_size=1
             )
         elif name == "sintel":
             for pass_name in ("clean", "final"):
                 val_dataset = Sintel(
                     root=args.dataset_root, split="train", pass_name=pass_name, transforms=preprocessing
                 )
-                _validate(
+                _evaluate(
                     model,
                     args,
                     val_dataset,
@@ -172,11 +178,12 @@ def validate(model, args):


 def train_one_epoch(model, optimizer, scheduler, train_loader, logger, args):
+    device = torch.device(args.device)
     for data_blob in logger.log_every(train_loader):

         optimizer.zero_grad()

-        image1, image2, flow_gt, valid_flow_mask = (x.cuda() for x in data_blob)
+        image1, image2, flow_gt, valid_flow_mask = (x.to(device) for x in data_blob)
         flow_predictions = model(image1, image2, num_flow_updates=args.num_flow_updates)

         loss = utils.sequence_loss(flow_predictions, flow_gt, valid_flow_mask, args.gamma)
@@ -200,36 +207,68 @@ def main(args):
         raise ValueError("The weights parameter works only in prototype mode. Please pass the --prototype argument.")
     utils.setup_ddp(args)

+    if args.distributed and args.device == "cpu":
+        raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun")
+    device = torch.device(args.device)
+
     if args.prototype:
         model = prototype.models.optical_flow.__dict__[args.model](weights=args.weights)
     else:
         model = torchvision.models.optical_flow.__dict__[args.model](pretrained=args.pretrained)

-    model = model.to(args.local_rank)
-    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])
+    if args.distributed:
+        model = model.to(args.local_rank)
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])
+        model_without_ddp = model.module
+    else:
+        model.to(device)
+        model_without_ddp = model

     if args.resume is not None:
-        d = torch.load(args.resume, map_location="cpu")
-        model.load_state_dict(d, strict=True)
+        checkpoint = torch.load(args.resume, map_location="cpu")
+        model_without_ddp.load_state_dict(checkpoint["model"])

     if args.train_dataset is None:
         # Set deterministic CUDNN algorithms, since they can affect epe a fair bit.
         torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
-        validate(model, args)
+        evaluate(model, args)
         return

     print(f"Parameter Count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

+    train_dataset = get_train_dataset(args.train_dataset, args.dataset_root)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, eps=args.adamw_eps)
+
+    scheduler = torch.optim.lr_scheduler.OneCycleLR(
+        optimizer=optimizer,
+        max_lr=args.lr,
+        epochs=args.epochs,
+        steps_per_epoch=ceil(len(train_dataset) / (args.world_size * args.batch_size)),
+        pct_start=0.05,
+        cycle_momentum=False,
+        anneal_strategy="linear",
+    )
+
+    if args.resume is not None:
+        optimizer.load_state_dict(checkpoint["optimizer"])
+        scheduler.load_state_dict(checkpoint["scheduler"])
+        args.start_epoch = checkpoint["epoch"] + 1
+    else:
+        args.start_epoch = 0
+
     torch.backends.cudnn.benchmark = True

     model.train()
     if args.freeze_batch_norm:
         utils.freeze_batch_norm(model.module)

-    train_dataset = get_train_dataset(args.train_dataset, args.dataset_root)
+    if args.distributed:
+        sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True, drop_last=True)
+    else:
+        sampler = torch.utils.data.RandomSampler(train_dataset)

-    sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True, drop_last=True)
     train_loader = torch.utils.data.DataLoader(
         train_dataset,
         sampler=sampler,
@@ -238,25 +277,15 @@ def main(args):
         num_workers=args.num_workers,
     )

-    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, eps=args.adamw_eps)
-
-    scheduler = torch.optim.lr_scheduler.OneCycleLR(
-        optimizer=optimizer,
-        max_lr=args.lr,
-        epochs=args.epochs,
-        steps_per_epoch=ceil(len(train_dataset) / (args.world_size * args.batch_size)),
-        pct_start=0.05,
-        cycle_momentum=False,
-        anneal_strategy="linear",
-    )
-
     logger = utils.MetricLogger()

     done = False
-    for current_epoch in range(args.epochs):
+    for current_epoch in range(args.start_epoch, args.epochs):
         print(f"EPOCH {current_epoch}")
+        if args.distributed:
+            # needed on distributed mode, otherwise the data loading order would be the same for all epochs
+            sampler.set_epoch(current_epoch)

-        sampler.set_epoch(current_epoch)  # needed, otherwise the data loading order would be the same for all epochs
         train_one_epoch(
             model=model,
             optimizer=optimizer,
@@ -269,13 +298,19 @@ def main(args):
         # Note: we don't sync the SmoothedValues across processes, so the printed metrics are just those of rank 0
         print(f"Epoch {current_epoch} done. ", logger)

-        if args.rank == 0:
-            # TODO: Also save the optimizer and scheduler
-            torch.save(model.state_dict(), Path(args.output_dir) / f"{args.name}_{current_epoch}.pth")
-            torch.save(model.state_dict(), Path(args.output_dir) / f"{args.name}.pth")
+        if not args.distributed or args.rank == 0:
+            checkpoint = {
+                "model": model_without_ddp.state_dict(),
+                "optimizer": optimizer.state_dict(),
+                "scheduler": scheduler.state_dict(),
+                "epoch": current_epoch,
+                "args": args,
+            }
+            torch.save(checkpoint, Path(args.output_dir) / f"{args.name}_{current_epoch}.pth")
+            torch.save(checkpoint, Path(args.output_dir) / f"{args.name}.pth")

         if current_epoch % args.val_freq == 0 or done:
-            validate(model, args)
+            evaluate(model, args)
             model.train()
             if args.freeze_batch_norm:
                 utils.freeze_batch_norm(model.module)
@@ -349,6 +384,7 @@ def get_args_parser(add_help=True):
         action="store_true",
     )
     parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load.")
+    parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu, Default: cuda)")

     return parser
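The checkpointing change above replaces the bare state_dict dump with a dict that also carries the optimizer, scheduler, and epoch, which is what lets --resume restore the full training state. A minimal sketch of how that format round-trips, using a toy model and a hypothetical path (the reference script additionally stores the argparse namespace under "args"):

# Sketch of the checkpoint format introduced above; toy model, hypothetical path.
import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-4, total_steps=100)

checkpoint = {
    "model": model.state_dict(),
    "optimizer": optimizer.state_dict(),
    "scheduler": scheduler.state_dict(),
    "epoch": 5,
}
torch.save(checkpoint, "raft_example.pth")

# Resuming loads all three state dicts and restarts at epoch + 1,
# mirroring the args.start_epoch = checkpoint["epoch"] + 1 logic in main().
resumed = torch.load("raft_example.pth", map_location="cpu")
model.load_state_dict(resumed["model"])
optimizer.load_state_dict(resumed["optimizer"])
scheduler.load_state_dict(resumed["scheduler"])
start_epoch = resumed["epoch"] + 1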

references/optical_flow/utils.py

Lines changed: 6 additions & 1 deletion
@@ -256,7 +256,12 @@ def setup_ddp(args):
         # if we're here, the script was called by run_with_submitit.py
         args.local_rank = args.gpu
     else:
-        raise ValueError(r"Sorry, I can't set up the distributed training ¯\_(ツ)_/¯.")
+        print("Not using distributed mode!")
+        args.distributed = False
+        args.world_size = 1
+        return
+
+    args.distributed = True

     _redefine_print(is_main=(args.rank == 0))
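With this change setup_ddp no longer raises when no distributed environment is detected; it flags the run as non-distributed and returns early, and the callers in train.py branch on args.distributed. A rough standalone sketch of that fallback pattern follows; the environment-variable detection shown here is an assumption, and the real helper also covers torchrun and submitit/SLURM launches:

# Hedged sketch of the non-distributed fallback; env-var detection is assumed,
# and the real setup_ddp handles more launch modes (torchrun, submitit/SLURM).
import os
import torch

def setup_ddp_sketch(args):
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ["WORLD_SIZE"])
        args.local_rank = int(os.environ["LOCAL_RANK"])
    else:
        print("Not using distributed mode!")
        args.distributed = False
        args.world_size = 1
        return

    args.distributed = True
    torch.distributed.init_process_group("nccl", rank=args.rank, world_size=args.world_size)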

test/test_prototype_transforms.py

Lines changed: 56 additions & 2 deletions
@@ -2,9 +2,10 @@
 
 import pytest
 import torch
+from common_utils import assert_equal
 from test_prototype_transforms_functional import make_images, make_bounding_boxes, make_one_hot_labels
 from torchvision.prototype import transforms, features
-from torchvision.transforms.functional import to_pil_image
+from torchvision.transforms.functional import to_pil_image, pil_to_tensor


 def make_vanilla_tensor_images(*args, **kwargs):
@@ -66,10 +67,10 @@ def parametrize_from_transforms(*transforms):
 class TestSmoke:
     @parametrize_from_transforms(
         transforms.RandomErasing(p=1.0),
-        transforms.HorizontalFlip(),
         transforms.Resize([16, 16]),
         transforms.CenterCrop([16, 16]),
         transforms.ConvertImageDtype(),
+        transforms.RandomHorizontalFlip(),
     )
     def test_common(self, transform, input):
         transform(input)
@@ -188,3 +189,56 @@ def test_random_resized_crop(self, transform, input):
     )
     def test_convert_image_color_space(self, transform, input):
         transform(input)
+
+
+@pytest.mark.parametrize("p", [0.0, 1.0])
+class TestRandomHorizontalFlip:
+    def input_expected_image_tensor(self, p, dtype=torch.float32):
+        input = torch.tensor([[[0, 1], [0, 1]], [[1, 0], [1, 0]]], dtype=dtype)
+        expected = torch.tensor([[[1, 0], [1, 0]], [[0, 1], [0, 1]]], dtype=dtype)
+
+        return input, expected if p == 1 else input
+
+    def test_simple_tensor(self, p):
+        input, expected = self.input_expected_image_tensor(p)
+        transform = transforms.RandomHorizontalFlip(p=p)
+
+        actual = transform(input)
+
+        assert_equal(expected, actual)
+
+    def test_pil_image(self, p):
+        input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8)
+        transform = transforms.RandomHorizontalFlip(p=p)
+
+        actual = transform(to_pil_image(input))
+
+        assert_equal(expected, pil_to_tensor(actual))
+
+    def test_features_image(self, p):
+        input, expected = self.input_expected_image_tensor(p)
+        transform = transforms.RandomHorizontalFlip(p=p)
+
+        actual = transform(features.Image(input))
+
+        assert_equal(features.Image(expected), actual)
+
+    def test_features_segmentation_mask(self, p):
+        input, expected = self.input_expected_image_tensor(p)
+        transform = transforms.RandomHorizontalFlip(p=p)
+
+        actual = transform(features.SegmentationMask(input))
+
+        assert_equal(features.SegmentationMask(expected), actual)
+
+    def test_features_bounding_box(self, p):
+        input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10))
+        transform = transforms.RandomHorizontalFlip(p=p)
+
+        actual = transform(input)
+
+        expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input
+        expected = features.BoundingBox.new_like(input, data=expected_image_tensor)
+        assert_equal(expected, actual)
+        assert actual.format == expected.format
+        assert actual.image_size == expected.image_size
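The expected box in test_features_bounding_box follows the usual XYXY horizontal-flip rule: both x coordinates are reflected about the image width and swap roles, while the y coordinates are unchanged. A small worked check of that arithmetic with plain tensor ops, independent of the prototype kernels:

import torch

def hflip_xyxy(box, image_width):
    # Reflect the x coordinates about the image width; xmin and xmax swap roles.
    x1, y1, x2, y2 = box.unbind(-1)
    return torch.stack([image_width - x2, y1, image_width - x1, y2], dim=-1)

box = torch.tensor([0, 0, 5, 5])
print(hflip_xyxy(box, image_width=10))  # tensor([ 5,  0, 10,  5]) -- the value the test expects for p=1.0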

torchvision/prototype/transforms/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,13 +8,13 @@
 from ._auto_augment import RandAugment, TrivialAugmentWide, AutoAugment, AugMix
 from ._container import Compose, RandomApply, RandomChoice, RandomOrder
 from ._geometry import (
-    HorizontalFlip,
     Resize,
     CenterCrop,
     RandomResizedCrop,
     FiveCrop,
     TenCrop,
     BatchMultiCrop,
+    RandomHorizontalFlip,
     RandomZoomOut,
 )
 from ._meta import ConvertBoundingBoxFormat, ConvertImageDtype, ConvertImageColorSpace

torchvision/prototype/transforms/_geometry.py

Lines changed: 15 additions & 1 deletion
@@ -13,11 +13,25 @@
 from ._utils import query_image, get_image_dimensions, has_any, is_simple_tensor
 
 
-class HorizontalFlip(Transform):
+class RandomHorizontalFlip(Transform):
+    def __init__(self, p: float = 0.5) -> None:
+        super().__init__()
+        self.p = p
+
+    def forward(self, *inputs: Any) -> Any:
+        sample = inputs if len(inputs) > 1 else inputs[0]
+        if torch.rand(1) >= self.p:
+            return sample
+
+        return super().forward(sample)
+
     def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
         if isinstance(input, features.Image):
             output = F.horizontal_flip_image_tensor(input)
             return features.Image.new_like(input, output)
+        elif isinstance(input, features.SegmentationMask):
+            output = F.horizontal_flip_segmentation_mask(input)
+            return features.SegmentationMask.new_like(input, output)
         elif isinstance(input, features.BoundingBox):
             output = F.horizontal_flip_bounding_box(input, format=input.format, image_size=input.image_size)
             return features.BoundingBox.new_like(input, output)
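Taken together, RandomHorizontalFlip now decides once per call whether to flip and then dispatches per input type (image, segmentation mask, bounding box). A hedged usage sketch of the prototype API shown in this diff; the feature constructors match the tests above, but passing several inputs at once and getting a tuple back is an assumption about Transform.forward:

# Usage sketch only: assumes Transform.forward maps over a tuple of inputs and
# returns the same structure. Shapes and values are illustrative.
import torch
from torchvision.prototype import features, transforms

image = features.Image(torch.rand(3, 10, 10))
boxes = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10))

transform = transforms.RandomHorizontalFlip(p=1.0)  # p=1.0 -> always flip
flipped_image, flipped_boxes = transform(image, boxes)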

torchvision/prototype/transforms/functional/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@
     horizontal_flip_bounding_box,
     horizontal_flip_image_tensor,
     horizontal_flip_image_pil,
+    horizontal_flip_segmentation_mask,
     resize_bounding_box,
     resize_image_tensor,
     resize_image_pil,

torchvision/prototype/transforms/functional/_geometry.py

Lines changed: 4 additions & 0 deletions
@@ -15,6 +15,10 @@
 horizontal_flip_image_pil = _FP.hflip
 
 
+def horizontal_flip_segmentation_mask(segmentation_mask: torch.Tensor) -> torch.Tensor:
+    return horizontal_flip_image_tensor(segmentation_mask)
+
+
 def horizontal_flip_bounding_box(
     bounding_box: torch.Tensor, format: features.BoundingBoxFormat, image_size: Tuple[int, int]
 ) -> torch.Tensor:
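The new segmentation-mask kernel just forwards to the image-tensor flip, i.e. the mask is treated as a plain tensor whose last (width) dimension gets reversed. A quick sanity sketch under that assumption:

# Sanity sketch: horizontally flipping a mask is assumed equivalent to reversing
# its last (width) dimension with torch.flip.
import torch

mask = torch.tensor([[0, 0, 1],
                     [0, 1, 1]])
print(torch.flip(mask, dims=[-1]))  # tensor([[1, 0, 0], [1, 1, 0]])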
