From 3033c9d3cf72cbdc4dd1c0954e22c2b13791ea1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=2E=20Khu=C3=AA=20L=C3=AA-Huu?=
Date: Fri, 24 Apr 2020 18:38:15 +0200
Subject: [PATCH 1/3] Fix training resuming in references/segmentation

---
 references/segmentation/train.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/references/segmentation/train.py b/references/segmentation/train.py
index b1173d5323a..e37a4e92886 100644
--- a/references/segmentation/train.py
+++ b/references/segmentation/train.py
@@ -128,10 +128,6 @@ def main(args):
     if args.distributed:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
 
-    if args.resume:
-        checkpoint = torch.load(args.resume, map_location='cpu')
-        model.load_state_dict(checkpoint['model'])
-
     model_without_ddp = model
     if args.distributed:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
@@ -157,8 +153,15 @@ def main(args):
         optimizer,
         lambda x: (1 - x / (len(data_loader) * args.epochs)) ** 0.9)
 
+    if args.resume:
+        checkpoint = torch.load(args.resume, map_location='cpu')
+        model_without_ddp.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+        args.start_epoch = checkpoint['epoch'] + 1
+
     start_time = time.time()
-    for epoch in range(args.epochs):
+    for epoch in range(args.start_epoch, args.epochs):
         if args.distributed:
             train_sampler.set_epoch(epoch)
         train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, args.print_freq)
@@ -168,6 +171,7 @@ def main(args):
             {
                 'model': model_without_ddp.state_dict(),
                 'optimizer': optimizer.state_dict(),
+                'lr_scheduler': lr_scheduler.state_dict(),
                 'epoch': epoch,
                 'args': args
             },
@@ -201,6 +205,8 @@ def parse_args():
    parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
    parser.add_argument('--output-dir', default='.', help='path where to save')
    parser.add_argument('--resume', default='', help='resume from checkpoint')
+    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                        help='start epoch')
    parser.add_argument(
        "--test-only",
        dest="test_only",
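For context, the fix above works because the checkpoint is now loaded only after the
optimizer and lr_scheduler have been created, so their states can be restored
alongside the model weights; the weights go into `model_without_ddp` (the unwrapped
module, matching how they were saved), and the loop restarts at
`checkpoint['epoch'] + 1`. Below is a minimal, self-contained sketch of the same
save/resume pattern — the toy `Linear` model, `SGD` optimizer, and `StepLR`
scheduler are illustrative stand-ins, not what `train.py` actually builds:

```
# Sketch of the save/resume pattern from the patch above, in isolation.
# The model/optimizer/scheduler choices here are placeholder assumptions.
import torch

model = torch.nn.Linear(10, 2)  # stand-in for the real segmentation model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30)

start_epoch = 0
resume = ''  # set to a checkpoint path, e.g. 'model_9.pth', to resume
if resume:
    checkpoint = torch.load(resume, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    start_epoch = checkpoint['epoch'] + 1  # continue *after* the saved epoch

for epoch in range(start_epoch, 2):
    # ... one epoch of training would run here ...
    lr_scheduler.step()
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),  # scheduler state is saved too
        'epoch': epoch,
    }, 'model_{}.pth'.format(epoch))
```

Loading with `map_location='cpu'` keeps the restore device-agnostic, which matters
when the checkpoint was written from a different GPU layout.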
From 181f81abb2faec83b39de56e800649ff387a1b92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=2E=20Khu=C3=AA=20L=C3=AA-Huu?=
Date: Fri, 10 Sep 2021 14:59:50 +0200
Subject: [PATCH 2/3] Clarification for training resnext101_32x8d

---
 references/classification/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/references/classification/README.md b/references/classification/README.md
index e0b7f210175..6e7a3b4f37f 100644
--- a/references/classification/README.md
+++ b/references/classification/README.md
@@ -40,12 +40,16 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
 
 ### ResNext-101 32x8d
 
-On 8 nodes, each with 8 GPUs (for a total of 64 GPUS)
 ```
 python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
     --model resnext101_32x8d --epochs 100
 ```
 
+Note that the above command corresponds to a single node with 8 GPUs. If you use
+a different number of GPUs and/or a different batch size, then the learning rate
+should be scaled accordingly. For example, the pretrained model provided by
+`torchvision` was trained on 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
+with `--batch_size 16` and `--lr 0.4`.
+
 ### MobileNetV2
 ```

From d64f46cb195b020d9196564c01ddfe84e0dbf4a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=2E=20Khu=C3=AA=20L=C3=AA-Huu?=
Date: Fri, 10 Sep 2021 15:15:18 +0200
Subject: [PATCH 3/3] Update references/classification/README.md

Co-authored-by: Nicolas Hug
---
 references/classification/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/references/classification/README.md b/references/classification/README.md
index 6e7a3b4f37f..519d31fc3fe 100644
--- a/references/classification/README.md
+++ b/references/classification/README.md
@@ -49,7 +49,8 @@ Note that the above command corresponds to a single node with 8 GPUs. If you use
 a different number of GPUs and/or a different batch size, then the learning rate
 should be scaled accordingly. For example, the pretrained model provided by
 `torchvision` was trained on 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
-with `--batch_size 16` and `--lr 0.4`.
+with `--batch_size 16` and `--lr 0.4`, instead of the current defaults,
+which are `--batch_size 32` and `--lr 0.1`.
 
 ### MobileNetV2
 ```
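The numbers in the README note follow the standard linear scaling rule: the
learning rate scales with the global batch size (number of GPUs × per-GPU
`--batch_size`). The defaults give 8 × 32 = 256 images per step with `--lr 0.1`;
the pretrained recipe uses 64 × 16 = 1024 images per step, a 4× larger global
batch, hence 4 × 0.1 = 0.4. A small sketch of that arithmetic — the `scaled_lr`
helper is hypothetical and not part of the reference scripts:

```
# Linear LR scaling implied by the README note; scaled_lr is illustrative only.
def scaled_lr(num_gpus, batch_size_per_gpu, ref_lr=0.1, ref_global_batch=8 * 32):
    global_batch = num_gpus * batch_size_per_gpu
    return ref_lr * global_batch / ref_global_batch

print(scaled_lr(8, 32))   # 0.1 -- one node with the default batch size
print(scaled_lr(64, 16))  # 0.4 -- 8 nodes x 8 GPUs, the pretrained recipe
```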