From 4228e246afff799e2c224ba9201216a9afcc507b Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 3 Jul 2023 20:55:57 +0000
Subject: [PATCH 1/3] revert automatic chunking

---
 .../source/en/api/pipelines/text_to_video.mdx | 24 ++++++++++++++++++-
 .../pipeline_text_to_video_synth.py           |  3 ---
 .../pipeline_text_to_video_synth_img2img.py   |  3 ---
 3 files changed, 23 insertions(+), 7 deletions(-)
diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx
index 75868d7dd6ea..b9ed331da13b 100644
--- a/docs/source/en/api/pipelines/text_to_video.mdx
+++ b/docs/source/en/api/pipelines/text_to_video.mdx
@@ -138,6 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
 pipe.enable_model_cpu_offload()
 
 # memory optimization
+self.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()
 
 prompt = "Darth Vader surfing a wave"
@@ -150,10 +151,13 @@ Now the video can be upscaled:
 
 ```py
 pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16)
-pipe.vae.enable_slicing()
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
 
+# memory optimization
+self.unet.enable_forward_chunking(chunk_size=1, dim=1)
+pipe.enable_vae_slicing()
+
 video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
 
 video_frames = pipe(prompt, video=video, strength=0.6).frames
@@ -175,6 +179,24 @@ Here are some sample outputs:
     </tr>
 </table>
 
+### Memory optimizations
+
+Text-guided video generation with [`~TextToVideoSDPipeline`] and [`~VideoToVideoSDPipeline`] is very memory intensive, both
+when denoising with [`~UNet3DConditionModel`] and when decoding with [`~AutoencoderKL`]. It is possible, though, to reduce
+memory usage at the cost of increased runtime while still obtaining the exact same result. To do so, it is recommended to enable
+**forward chunking** and **VAE slicing**:
+
+Forward chunking via [`~UNet3DConditionModel.enable_forward_chunking`] is explained in [this blog post](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) and
+allows you to significantly reduce the memory required by the UNet. You can chunk the feed-forward layer over the `num_frames`
+dimension by doing:
+
+```py
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
+```
+
+Vae slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also 
+gives significant memory savings since the two pipelines decode all image frames at once.
+
 ## Available checkpoints 
 
 * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/)
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
index 680a524732e9..e30f183808a5 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
@@ -634,9 +634,6 @@ def __call__(
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)
-
         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
index 1b6cd9c2b392..ce5109a58213 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
@@ -709,9 +709,6 @@ def __call__(
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)
-
         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:

From 7a83608039e469c96d8d7691a86e34919aa787ec Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 3 Jul 2023 22:58:12 +0200
Subject: [PATCH 2/3] Apply suggestions from code review

---
 docs/source/en/api/pipelines/text_to_video.mdx | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx
index b9ed331da13b..cdeeb9ea3ff9 100644
--- a/docs/source/en/api/pipelines/text_to_video.mdx
+++ b/docs/source/en/api/pipelines/text_to_video.mdx
@@ -138,7 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
 pipe.enable_model_cpu_offload()
 
 # memory optimization
-self.unet.enable_forward_chunking(chunk_size=1, dim=1)
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()
 
 prompt = "Darth Vader surfing a wave"
@@ -155,7 +155,7 @@ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
 
 # memory optimization
-self.unet.enable_forward_chunking(chunk_size=1, dim=1)
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()
 
 video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
@@ -197,6 +197,9 @@ pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 Vae slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also 
 gives significant memory savings since the two pipelines decode all image frames at once.
 
+```py
+pipe.enable_vae_slicing()
+
 ## Available checkpoints 
 
 * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/)

From e51d7b877bbf274e4ac1f2d7baa4cad36dcc1557 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 3 Jul 2023 20:59:28 +0000
Subject: [PATCH 3/3] revert automatic chunking

---
 docs/source/en/api/pipelines/text_to_video.mdx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx
index cdeeb9ea3ff9..583d461ea948 100644
--- a/docs/source/en/api/pipelines/text_to_video.mdx
+++ b/docs/source/en/api/pipelines/text_to_video.mdx
@@ -199,6 +199,7 @@ gives significant memory savings since the two pipelines decode all image frames
 
 ```py
 pipe.enable_vae_slicing()
+```
 
 ## Available checkpoints