Skip to content

Commit 7e2cc71

Browse files
Add video img2img (huggingface#3900)
* Add image to image video; * Improve; * better naming; * make fix copies; * add docs; * finish tests; * trigger tests; * make style; * correct; * finish; * Fix more; * make style; * finish
1 parent 9c48419 commit 7e2cc71

7 files changed

+798
-3
lines changed

__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@
173173
VersatileDiffusionImageVariationPipeline,
174174
VersatileDiffusionPipeline,
175175
VersatileDiffusionTextToImagePipeline,
176+
VideoToVideoSDPipeline,
176177
VQDiffusionPipeline,
177178
)
178179

models/autoencoder_kl.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,12 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderK
229229
if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
230230
return self.tiled_encode(x, return_dict=return_dict)
231231

232-
h = self.encoder(x)
232+
if self.use_slicing and x.shape[0] > 1:
233+
encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
234+
h = torch.cat(encoded_slices)
235+
else:
236+
h = self.encoder(x)
237+
233238
moments = self.quant_conv(h)
234239
posterior = DiagonalGaussianDistribution(moments)
235240

pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
StableUnCLIPPipeline,
9090
)
9191
from .stable_diffusion_safe import StableDiffusionPipelineSafe
92-
from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline
92+
from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline, VideoToVideoSDPipeline
9393
from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
9494
from .unidiffuser import ImageTextPipelineOutput, UniDiffuserModel, UniDiffuserPipeline, UniDiffuserTextDecoder
9595
from .versatile_diffusion import (

pipelines/text_to_video_synthesis/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,6 @@ class TextToVideoSDPipelineOutput(BaseOutput):
2828
except OptionalDependencyNotAvailable:
2929
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
3030
else:
31-
from .pipeline_text_to_video_synth import TextToVideoSDPipeline # noqa: F401
31+
from .pipeline_text_to_video_synth import TextToVideoSDPipeline
32+
from .pipeline_text_to_video_synth_img2img import VideoToVideoSDPipeline # noqa: F401
3233
from .pipeline_text_to_video_zero import TextToVideoZeroPipeline

pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,9 @@ def __call__(
672672
if callback is not None and i % callback_steps == 0:
673673
callback(i, t, latents)
674674

675+
if output_type == "latent":
676+
return TextToVideoSDPipelineOutput(frames=latents)
677+
675678
video_tensor = self.decode_latents(latents)
676679

677680
if output_type == "pt":

0 commit comments

Comments (0)