
Commit 4e89856

revert automatic chunking (#3934)

* revert automatic chunking
* Apply suggestions from code review
* revert automatic chunking

1 parent 332d2bb commit 4e89856

File tree

docs/source/en/api/pipelines/text_to_video.mdx
src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py

3 files changed: +27 -7 lines changed

docs/source/en/api/pipelines/text_to_video.mdx

Lines changed: 27 additions & 1 deletion
@@ -138,6 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
 pipe.enable_model_cpu_offload()
 
 # memory optimization
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()
 
 prompt = "Darth Vader surfing a wave"
@@ -150,10 +151,13 @@ Now the video can be upscaled:
 
 ```py
 pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16)
-pipe.vae.enable_slicing()
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
 
+# memory optimization
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
+pipe.enable_vae_slicing()
+
 video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
 
 video_frames = pipe(prompt, video=video, strength=0.6).frames
@@ -175,6 +179,28 @@ Here are some sample outputs:
 </tr>
 </table>
 
+### Memory optimizations
+
+Text-guided video generation with [`~TextToVideoSDPipeline`] and [`~VideoToVideoSDPipeline`] is very memory intensive both
+when denoising with [`~UNet3DConditionModel`] and when decoding with [`~AutoencoderKL`]. It is possible, though, to reduce
+memory usage at the cost of increased runtime and still achieve the exact same result. To do so, it is recommended to enable
+**forward chunking** and **vae slicing**:
+
+Forward chunking via [`~UNet3DConditionModel.enable_forward_chunking`] is explained in [this blog post](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) and
+allows you to significantly reduce the required memory for the unet. You can chunk the feed forward layer over the `num_frames`
+dimension by doing:
+
+```py
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
+```
+
+Vae slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also
+gives significant memory savings, since the two pipelines otherwise decode all image frames at once.
+
+```py
+pipe.enable_vae_slicing()
+```
+
 ## Available checkpoints
 
 * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/)
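
Read together, the two snippets this commit edits describe an opt-in workflow: generate at low resolution, then upscale, enabling forward chunking and VAE slicing by hand on each pipeline. Below is a minimal end-to-end sketch of that workflow, assuming the checkpoints and method calls shown in the diff above plus standard imports (`torch`, `PIL`, `diffusers`); the exact generation arguments are elided from this diff, so the calls here are illustrative.

```py
import torch
from PIL import Image
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

# 1. Low-resolution text-to-video generation
pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()

# memory optimization: now explicit, since the pipelines no longer chunk automatically
pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
pipe.enable_vae_slicing()

prompt = "Darth Vader surfing a wave"
video_frames = pipe(prompt).frames  # generation arguments follow the surrounding doc

# 2. Upscale the generated frames with the XL checkpoint
pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

# memory optimization
pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
pipe.enable_vae_slicing()

video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
video_frames = pipe(prompt, video=video, strength=0.6).frames
```

Per the new doc section, both `enable_*` calls leave the output unchanged; they only trade runtime for peak memory.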

src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py

Lines changed: 0 additions & 3 deletions
@@ -634,9 +634,6 @@ def __call__(
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)
-
         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:

src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py

Lines changed: 0 additions & 3 deletions
@@ -709,9 +709,6 @@ def __call__(
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)
-
         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
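
With the automatic call removed from `__call__` in both pipeline files, chunking becomes an explicit, user-facing step. A minimal sketch of the opt-in usage, assuming the `damo-vilab/text-to-video-ms-1.7b` checkpoint listed in the doc and fp16 weights:

```py
import torch
from diffusers import TextToVideoSDPipeline

pipe = TextToVideoSDPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

# Opt in to what step 6.1 used to do unconditionally inside the pipeline.
pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
pipe.enable_vae_slicing()

video_frames = pipe("Darth Vader surfing a wave").frames
```

The same applies to [`VideoToVideoSDPipeline`]: nothing is enabled automatically anymore, so omitting these calls gives the faster but more memory-hungry behaviour.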

0 commit comments