From 4228e246afff799e2c224ba9201216a9afcc507b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jul 2023 20:55:57 +0000 Subject: [PATCH 1/3] revert automatic chunking --- .../source/en/api/pipelines/text_to_video.mdx | 24 ++++++++++++++++++- .../pipeline_text_to_video_synth.py | 3 --- .../pipeline_text_to_video_synth_img2img.py | 3 --- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx index 75868d7dd6ea..b9ed331da13b 100644 --- a/docs/source/en/api/pipelines/text_to_video.mdx +++ b/docs/source/en/api/pipelines/text_to_video.mdx @@ -138,6 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt pipe.enable_model_cpu_offload() # memory optimization +self.unet.enable_forward_chunking(chunk_size=1, dim=1) pipe.enable_vae_slicing() prompt = "Darth Vader surfing a wave" @@ -150,10 +151,13 @@ Now the video can be upscaled: ```py pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16) -pipe.vae.enable_slicing() pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe.enable_model_cpu_offload() +# memory optimization +self.unet.enable_forward_chunking(chunk_size=1, dim=1) +pipe.enable_vae_slicing() + video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames] video_frames = pipe(prompt, video=video, strength=0.6).frames @@ -175,6 +179,24 @@ Here are some sample outputs: +### Memory optimizations + +Text-guided video generation with [`~TextToVideoSDPipeline`] and [`~VideoToVideoSDPipeline`] is very memory intensive both +when denoising with [`~UNet3DConditionModel`] and when decoding with [`~AutoencoderKL`]. It is possible though to reduce +memory usage at the cost of increased runtime to achieve the exact same result. 
To do so, it is recommended to enable +**forward chunking** and **vae slicing**: + +Forward chunking via [`~UNet3DConditionModel.enable_forward_chunking`] is explained in [this blog post](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) and +makes it possible to significantly reduce the required memory for the unet. You can chunk the feed forward layer over the `num_frames` +dimension by doing: + +```py +pipe.unet.enable_forward_chunking(chunk_size=1, dim=1) +``` + +Vae slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also +gives significant memory savings since the two pipelines decode all image frames at once. + ## Available checkpoints * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 680a524732e9..e30f183808a5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -634,9 +634,6 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 6.1 Chunk feed-forward computation to save memory - self.unet.enable_forward_chunking(chunk_size=1, dim=1) - # 7. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 1b6cd9c2b392..ce5109a58213 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -709,9 +709,6 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 6.1 Chunk feed-forward computation to save memory - self.unet.enable_forward_chunking(chunk_size=1, dim=1) - # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: From 7a83608039e469c96d8d7691a86e34919aa787ec Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jul 2023 22:58:12 +0200 Subject: [PATCH 2/3] Apply suggestions from code review --- docs/source/en/api/pipelines/text_to_video.mdx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx index b9ed331da13b..cdeeb9ea3ff9 100644 --- a/docs/source/en/api/pipelines/text_to_video.mdx +++ b/docs/source/en/api/pipelines/text_to_video.mdx @@ -138,7 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt pipe.enable_model_cpu_offload() # memory optimization -self.unet.enable_forward_chunking(chunk_size=1, dim=1) +pipe.unet.enable_forward_chunking(chunk_size=1, dim=1) pipe.enable_vae_slicing() prompt = "Darth Vader surfing a wave" @@ -155,7 +155,7 @@ pipe.scheduler = 
DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe.enable_model_cpu_offload() # memory optimization -self.unet.enable_forward_chunking(chunk_size=1, dim=1) +pipe.unet.enable_forward_chunking(chunk_size=1, dim=1) pipe.enable_vae_slicing() video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames] @@ -197,6 +197,9 @@ pipe.unet.enable_forward_chunking(chunk_size=1, dim=1) Vae slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also gives significant memory savings since the two pipelines decode all image frames at once. +```py +pipe.enable_vae_slicing() + ## Available checkpoints * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/) From e51d7b877bbf274e4ac1f2d7baa4cad36dcc1557 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jul 2023 20:59:28 +0000 Subject: [PATCH 3/3] revert automatic chunking --- docs/source/en/api/pipelines/text_to_video.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx index cdeeb9ea3ff9..583d461ea948 100644 --- a/docs/source/en/api/pipelines/text_to_video.mdx +++ b/docs/source/en/api/pipelines/text_to_video.mdx @@ -199,6 +199,7 @@ gives significant memory savings since the two pipelines decode all image frames ```py pipe.enable_vae_slicing() +``` ## Available checkpoints