controlnet training resize inputs to multiple of 8 (#3135)

williamberman · web-flow · commit 7e6886f5e93c · 2023-04-19T10:46:51.000-07:00
controlnet training center crop input images to multiple of 8

The pipeline code resizes inputs to multiples of 8.
Because the training script does not do this resizing,
the encoded image can end up with different height/width
dimensions than the encoded conditioning image (which uses a
separate encoder that's part of the controlnet model).

We resize and center crop both the input images and the
conditioning images to `--resolution`, so that they are the same
size as each other and as all other images in the batch. We also
check that `--resolution` is a multiple of 8.
diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py
@@ -525,6 +525,11 @@ def parse_args(input_args=None):
             " or the same number of `--validation_prompt`s and `--validation_image`s"
         )
 
+    if args.resolution % 8 != 0:
+        raise ValueError(
+            "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
+        )
+
     return args
 
 
@@ -607,6 +612,7 @@ def tokenize_captions(examples, is_train=True):
     image_transforms = transforms.Compose(
         [
             transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
             transforms.ToTensor(),
             transforms.Normalize([0.5], [0.5]),
         ]
@@ -615,6 +621,7 @@ def tokenize_captions(examples, is_train=True):
     conditioning_image_transforms = transforms.Compose(
         [
             transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution),
             transforms.ToTensor(),
         ]
     )

Original file line number	Diff line number	Diff line change
`@@ -525,6 +525,11 @@ def parse_args(input_args=None):`
`525`	`525`	" or the same number of `--validation_prompt`s and `--validation_image`s"
`526`	`526`	`)`
`527`	`527`
	`528`	`+ if args.resolution % 8 != 0:`
	`529`	`+ raise ValueError(`
	`530`	+ "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
	`531`	`+ )`
	`532`	`+`
`528`	`533`	`return args`
`529`	`534`
`530`	`535`
`@@ -607,6 +612,7 @@ def tokenize_captions(examples, is_train=True):`
`607`	`612`	`image_transforms = transforms.Compose(`
`608`	`613`	`[`
`609`	`614`	`transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),`
	`615`	`+ transforms.CenterCrop(args.resolution),`
`610`	`616`	`transforms.ToTensor(),`
`611`	`617`	`transforms.Normalize([0.5], [0.5]),`
`612`	`618`	`]`
`@@ -615,6 +621,7 @@ def tokenize_captions(examples, is_train=True):`
`615`	`621`	`conditioning_image_transforms = transforms.Compose(`
`616`	`622`	`[`
`617`	`623`	`transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),`
	`624`	`+ transforms.CenterCrop(args.resolution),`
`618`	`625`	`transforms.ToTensor(),`
`619`	`626`	`]`
`620`	`627`	`)`