From 3033c9d3cf72cbdc4dd1c0954e22c2b13791ea1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=2E=20Khu=C3=AA=20L=C3=AA-Huu?=
Date: Fri, 24 Apr 2020 18:38:15 +0200
Subject: [PATCH 1/3] Fix training resuming in references/segmentation

---
 references/segmentation/train.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/references/segmentation/train.py b/references/segmentation/train.py
index b1173d5323a..e37a4e92886 100644
--- a/references/segmentation/train.py
+++ b/references/segmentation/train.py
@@ -128,10 +128,6 @@ def main(args):
     if args.distributed:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
 
-    if args.resume:
-        checkpoint = torch.load(args.resume, map_location='cpu')
-        model.load_state_dict(checkpoint['model'])
-
     model_without_ddp = model
     if args.distributed:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
@@ -157,8 +153,15 @@ def main(args):
         optimizer,
         lambda x: (1 - x / (len(data_loader) * args.epochs)) ** 0.9)
 
+    if args.resume:
+        checkpoint = torch.load(args.resume, map_location='cpu')
+        model_without_ddp.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+        args.start_epoch = checkpoint['epoch'] + 1
+
     start_time = time.time()
-    for epoch in range(args.epochs):
+    for epoch in range(args.start_epoch, args.epochs):
         if args.distributed:
             train_sampler.set_epoch(epoch)
         train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, args.print_freq)
@@ -168,6 +171,7 @@ def main(args):
             {
                 'model': model_without_ddp.state_dict(),
                 'optimizer': optimizer.state_dict(),
+                'lr_scheduler': lr_scheduler.state_dict(),
                 'epoch': epoch,
                 'args': args
             },
@@ -201,6 +205,8 @@ def parse_args():
    parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
    parser.add_argument('--output-dir', default='.', help='path where to save')
    parser.add_argument('--resume', default='', help='resume from checkpoint')
+    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                        help='start epoch')
    parser.add_argument(
        "--test-only",
        dest="test_only",
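For context, the fix above works because the checkpoint is now loaded only after the
optimizer and lr_scheduler have been created, so their states can be restored
alongside the model weights; the weights go into `model_without_ddp` (the unwrapped
module, matching how they were saved), and the loop restarts at
`checkpoint['epoch'] + 1`. Below is a minimal, self-contained sketch of the same
save/resume pattern — the toy `Linear` model, `SGD` optimizer, and `StepLR`
scheduler are illustrative stand-ins, not what `train.py` actually builds:

```
# Sketch of the save/resume pattern from the patch above, in isolation.
# The model/optimizer/scheduler choices here are placeholder assumptions.
import torch

model = torch.nn.Linear(10, 2)  # stand-in for the real segmentation model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30)

start_epoch = 0
resume = ''  # set to a checkpoint path, e.g. 'model_9.pth', to resume
if resume:
    checkpoint = torch.load(resume, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    start_epoch = checkpoint['epoch'] + 1  # continue *after* the saved epoch

for epoch in range(start_epoch, 2):
    # ... one epoch of training would run here ...
    lr_scheduler.step()
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),  # scheduler state is saved too
        'epoch': epoch,
    }, 'model_{}.pth'.format(epoch))
```

Loading with `map_location='cpu'` keeps the restore device-agnostic, which matters
when the checkpoint was written from a different GPU layout.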
From 181f81abb2faec83b39de56e800649ff387a1b92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=2E=20Khu=C3=AA=20L=C3=AA-Huu?=
Date: Fri, 10 Sep 2021 14:59:50 +0200
Subject: [PATCH 2/3] Clarification for training resnext101_32x8d

---
 references/classification/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/references/classification/README.md b/references/classification/README.md
index e0b7f210175..6e7a3b4f37f 100644
--- a/references/classification/README.md
+++ b/references/classification/README.md
@@ -40,12 +40,16 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
 
 ### ResNext-101 32x8d
 
-On 8 nodes, each with 8 GPUs (for a total of 64 GPUS)
 ```
 python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
     --model resnext101_32x8d --epochs 100
 ```
 
+Note that the above command corresponds to a single node with 8 GPUs. If you use
+a different number of GPUs and/or a different batch size, then the learning rate
+should be scaled accordingly. For example, the pretrained model provided by
+`torchvision` was trained on 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
+with `--batch_size 16` and `--lr 0.4`.
+
 ### MobileNetV2
 ```

From d64f46cb195b020d9196564c01ddfe84e0dbf4a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=2E=20Khu=C3=AA=20L=C3=AA-Huu?=
Date: Fri, 10 Sep 2021 15:15:18 +0200
Subject: [PATCH 3/3] Update references/classification/README.md

Co-authored-by: Nicolas Hug
---
 references/classification/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/references/classification/README.md b/references/classification/README.md
index 6e7a3b4f37f..519d31fc3fe 100644
--- a/references/classification/README.md
+++ b/references/classification/README.md
@@ -49,7 +49,8 @@ Note that the above command corresponds to a single node with 8 GPUs. If you use
 a different number of GPUs and/or a different batch size, then the learning rate
 should be scaled accordingly. For example, the pretrained model provided by
 `torchvision` was trained on 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
-with `--batch_size 16` and `--lr 0.4`.
+with `--batch_size 16` and `--lr 0.4`, instead of the current defaults,
+which are `--batch_size 32` and `--lr 0.1`.
 
 ### MobileNetV2
 ```
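The numbers in the README note follow the standard linear scaling rule: the
learning rate scales with the global batch size (number of GPUs × per-GPU
`--batch_size`). The defaults give 8 × 32 = 256 images per step with `--lr 0.1`;
the pretrained recipe uses 64 × 16 = 1024 images per step, a 4× larger global
batch, hence 4 × 0.1 = 0.4. A small sketch of that arithmetic — the `scaled_lr`
helper is hypothetical and not part of the reference scripts:

```
# Linear LR scaling implied by the README note; scaled_lr is illustrative only.
def scaled_lr(num_gpus, batch_size_per_gpu, ref_lr=0.1, ref_global_batch=8 * 32):
    global_batch = num_gpus * batch_size_per_gpu
    return ref_lr * global_batch / ref_global_batch

print(scaled_lr(8, 32))   # 0.1 -- one node with the default batch size
print(scaled_lr(64, 16))  # 0.4 -- 8 nodes x 8 GPUs, the pretrained recipe
```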