diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx
index 75868d7dd6ea..583d461ea948 100644
--- a/docs/source/en/api/pipelines/text_to_video.mdx
+++ b/docs/source/en/api/pipelines/text_to_video.mdx
@@ -138,6 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
 pipe.enable_model_cpu_offload()
 
 # memory optimization
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()
 
 prompt = "Darth Vader surfing a wave"
@@ -150,10 +151,13 @@ Now the video can be upscaled:
 
 ```py
 pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16)
-pipe.vae.enable_slicing()
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
 
+# memory optimization
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
+pipe.enable_vae_slicing()
+
 video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
 
 video_frames = pipe(prompt, video=video, strength=0.6).frames
@@ -175,6 +179,28 @@ Here are some sample outputs:
 
+### Memory optimizations
+
+Text-guided video generation with [`~TextToVideoSDPipeline`] and [`~VideoToVideoSDPipeline`] is very memory intensive both
+when denoising with [`~UNet3DConditionModel`] and when decoding with [`~AutoencoderKL`]. Memory usage can be reduced at the
+cost of increased runtime while still producing exactly the same result. To do so, it is recommended to enable both
+**forward chunking** and **VAE slicing**:
+
+Forward chunking via [`~UNet3DConditionModel.enable_forward_chunking`] is explained in [this blog post](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) and
+significantly reduces the memory required by the UNet. You can chunk the feed-forward layers over the `num_frames`
+dimension by doing:
+
+```py
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
+```
+
+VAE slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also
+gives significant memory savings since both pipelines otherwise decode all image frames at once.
+
+```py
+pipe.enable_vae_slicing()
+```
+
 ## Available checkpoints
 
 * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/)
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
index 680a524732e9..e30f183808a5 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
@@ -634,9 +634,6 @@ def __call__(
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)
-
         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
index 1b6cd9c2b392..ce5109a58213 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
@@ -709,9 +709,6 @@ def __call__(
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)
-
         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
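
Because this change removes the hard-coded `enable_forward_chunking` call from both pipelines, callers who relied on the old behavior now have to opt in explicitly. A minimal sketch of the intended call-site usage, mirroring the updated docs (the checkpoint, prompt, and generation parameters below are illustrative only):

```py
# Minimal sketch: enable the now opt-in memory optimizations at the call site.
# Assumes a CUDA-capable GPU; checkpoint, prompt, and parameters are illustrative.
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()

# memory optimizations: chunk the UNet feed-forward layers over the frame dimension
# and decode the latents one slice at a time instead of all frames at once
pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
pipe.enable_vae_slicing()

prompt = "Darth Vader surfing a wave"
video_frames = pipe(prompt, num_frames=24).frames
video_path = export_to_video(video_frames)
```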