BestWishYsh committed
Update models/pipeline_consisid.py

models/pipeline_consisid.py (+55 -41)
CHANGED
@@ -14,7 +14,7 @@

 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Any, Dict, List, Optional, Tuple, Union

 import cv2
 import numpy as np
@@ -24,15 +24,13 @@ from transformers import T5EncoderModel, T5Tokenizer

 from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 from diffusers.image_processor import PipelineImageInput
+from diffusers.loaders import CogVideoXLoraLoaderMixin
 from diffusers.models import AutoencoderKLCogVideoX, ConsisIDTransformer3DModel
 from diffusers.models.embeddings import get_3d_rotary_pos_embed
 from diffusers.pipelines.consisid.pipeline_output import ConsisIDPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.schedulers import
-from diffusers.utils import (
-    logging,
-    replace_example_docstring,
-)
+from diffusers.schedulers import CogVideoXDPMScheduler
+from diffusers.utils import logging, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor

@@ -241,7 +239,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")


-class ConsisIDPipeline(DiffusionPipeline):
+class ConsisIDPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using ConsisID.

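Note: mixing `CogVideoXLoraLoaderMixin` into the class exposes the standard diffusers LoRA-loading API on this pipeline. A minimal usage sketch; the import path, checkpoint id, and LoRA file below are illustrative assumptions, not files shipped with this change.

import torch

from models.pipeline_consisid import ConsisIDPipeline  # import path assumed from this repo layout

# "BestWishYsh/ConsisID-preview" and the LoRA path are placeholders for illustration.
pipe = ConsisIDPipeline.from_pretrained("BestWishYsh/ConsisID-preview", torch_dtype=torch.bfloat16)
pipe.load_lora_weights("path/to/consisid_lora.safetensors", adapter_name="example_lora")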
@@ -278,8 +276,8 @@ class ConsisIDPipeline(DiffusionPipeline):
         tokenizer: T5Tokenizer,
         text_encoder: T5EncoderModel,
         vae: AutoencoderKLCogVideoX,
-        transformer:
-        scheduler:
+        transformer: ConsisIDTransformer3DModel,
+        scheduler: CogVideoXDPMScheduler,
     ):
         super().__init__()

@@ -611,21 +609,6 @@ class ConsisIDPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )

-    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
-    def fuse_qkv_projections(self) -> None:
-        r"""Enables fused QKV projections."""
-        self.fusing_transformer = True
-        self.transformer.fuse_qkv_projections()
-
-    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
-    def unfuse_qkv_projections(self) -> None:
-        r"""Disable QKV projection fusion if enabled."""
-        if not self.fusing_transformer:
-            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
-        else:
-            self.transformer.unfuse_qkv_projections()
-            self.fusing_transformer = False
-
     def _prepare_rotary_positional_embeddings(
         self,
         height: int,
@@ -635,8 +618,8 @@ class ConsisIDPipeline(DiffusionPipeline):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width =
-        base_size_height =
+        base_size_width = self.transformer.config.sample_width // self.transformer.config.patch_size
+        base_size_height = self.transformer.config.sample_height // self.transformer.config.patch_size

         grid_crops_coords = get_resize_crop_region_for_grid(
             (grid_height, grid_width), base_size_width, base_size_height
|
|
646 |
crops_coords=grid_crops_coords,
|
647 |
grid_size=(grid_height, grid_width),
|
648 |
temporal_size=num_frames,
|
|
|
649 |
)
|
650 |
|
651 |
-
freqs_cos = freqs_cos.to(device=device)
|
652 |
-
freqs_sin = freqs_sin.to(device=device)
|
653 |
return freqs_cos, freqs_sin
|
654 |
|
655 |
@property
|
@@ -660,6 +642,10 @@ class ConsisIDPipeline(DiffusionPipeline):
|
|
660 |
def num_timesteps(self):
|
661 |
return self._num_timesteps
|
662 |
|
|
|
|
|
|
|
|
|
663 |
@property
|
664 |
def interrupt(self):
|
665 |
return self._interrupt
|
@@ -675,8 +661,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         width: int = 720,
         num_frames: int = 49,
         num_inference_steps: int = 50,
-
-        guidance_scale: float = 6,
+        guidance_scale: float = 6.0,
         use_dynamic_cfg: bool = False,
         num_videos_per_prompt: int = 1,
         eta: float = 0.0,
@@ -686,6 +671,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -720,16 +706,17 @@ class ConsisIDPipeline(DiffusionPipeline):
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
             guidance_scale (`float`, *optional*, defaults to 6):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
+            use_dynamic_cfg (`bool`, *optional*, defaults to `False`):
+                If True, dynamically adjusts the guidance scale during inference. This allows the model to use a
+                progressive guidance scale, improving the balance between text-guided generation and image quality over
+                the course of the inference steps. Typically, early inference steps use a higher guidance scale for
+                more faithful image generation, while later steps reduce it for more diverse and natural results.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -752,6 +739,10 @@ class ConsisIDPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
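A hedged call-site sketch for the new `attention_kwargs` argument. In diffusers pipelines this dictionary is forwarded to the attention processors, and a `scale` entry is commonly interpreted as a LoRA scale; the prompt, image file, and abbreviated argument list below are placeholders (the identity-conditioning tensors described further down are omitted here).

from diffusers.utils import load_image

# `pipe` is a loaded ConsisIDPipeline (see the earlier sketch); inputs are placeholders.
image = load_image("face.png")
video = pipe(
    image=image,
    prompt="a person smiling at the camera",
    attention_kwargs={"scale": 0.8},  # assumption: treated as a LoRA/attention scale downstream
    num_inference_steps=50,
    guidance_scale=6.0,
).frames[0]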
@@ -764,6 +755,19 @@ class ConsisIDPipeline(DiffusionPipeline):
             max_sequence_length (`int`, defaults to `226`):
                 Maximum sequence length in encoded prompt. Must be consistent with
                 `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
+            id_vit_hidden (`Optional[torch.Tensor]`, *optional*):
+                The tensor representing the hidden features extracted from the face model, which are used to condition
+                the local facial extractor. This is crucial for the model to obtain high-frequency information of the
+                face. If not provided, the local facial extractor will not run normally.
+            id_cond (`Optional[torch.Tensor]`, *optional*):
+                The tensor representing the hidden features extracted from the clip model, which are used to condition
+                the local facial extractor. This is crucial for the model to edit facial features If not provided, the
+                local facial extractor will not run normally.
+            kps_cond (`Optional[torch.Tensor]`, *optional*):
+                A tensor that determines whether the global facial extractor use keypoint information for conditioning.
+                If provided, this tensor controls whether facial keypoints such as eyes, nose, and mouth landmarks are
+                used during the generation process. This helps ensure the model retains more facial low-frequency
+                information.

         Examples:

@@ -772,14 +776,14 @@ class ConsisIDPipeline(DiffusionPipeline):
             [`~pipelines.consisid.pipeline_output.ConsisIDPipelineOutput`] if `return_dict` is True, otherwise a
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
-        if num_frames > 49:
-            raise ValueError(
-                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
-            )

         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
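The new fallback lines resolve `height`, `width`, and `num_frames` from the transformer config when the caller passes `None`. A rough arithmetic sketch, assuming CogVideoX-style values (`sample_height=60`, `sample_width=90`, `sample_frames=49`) and a spatial VAE scale factor of 8; these numbers are assumptions, not read from this repo.

# Assumed configuration values, for illustration only.
sample_height, sample_width, sample_frames = 60, 90, 49
vae_scale_factor_spatial = 8

height = None or sample_height * vae_scale_factor_spatial  # 480
width = None or sample_width * vae_scale_factor_spatial    # 720
num_frames = None or sample_frames                         # 49
print(height, width, num_frames)  # 480 720 49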
@@ -795,6 +799,7 @@ class ConsisIDPipeline(DiffusionPipeline):
             negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False

         # 2. Default call parameters
@@ -827,7 +832,7 @@ class ConsisIDPipeline(DiffusionPipeline):
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)

         # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device)
         self._num_timesteps = len(timesteps)

         # 5. Prepare latents
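With the trailing argument dropped, the call defers entirely to the scheduler's own timestep schedule (the standard diffusers `retrieve_timesteps` helper simply calls `set_timesteps` and returns `scheduler.timesteps` when no custom timesteps are given). A self-contained illustration with a stock scheduler; the pipeline itself uses `CogVideoXDPMScheduler`.

from diffusers import DDPMScheduler  # stand-in scheduler, for illustration only

scheduler = DDPMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(num_inference_steps=50, device="cpu")
timesteps = scheduler.timesteps       # descending tensor of 50 timesteps
num_inference_steps = len(timesteps)  # 50
print(num_inference_steps, timesteps[:3])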
@@ -874,6 +879,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             # for DPM-solver++
             old_pred_original_sample = None
+            timesteps_cpu = timesteps.cpu()
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
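Keeping a CPU copy of the timestep tensor means the per-step `.item()` lookups used for dynamic CFG below read from host memory instead of a device tensor, which would otherwise trigger a device synchronization on every step. A small generic illustration:

import torch

timesteps = torch.linspace(999, 0, steps=50)  # stand-in for scheduler.timesteps
timesteps_cpu = timesteps.cpu()               # single copy before the loop

for i, t in enumerate(timesteps):
    t_val = timesteps_cpu[i].item()  # plain Python float, no per-step device sync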
@@ -893,6 +899,7 @@ class ConsisIDPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                     id_vit_hidden=id_vit_hidden,
                     id_cond=id_cond,
@@ -902,7 +909,14 @@ class ConsisIDPipeline(DiffusionPipeline):
                 # perform guidance
                 if use_dynamic_cfg:
                     self._guidance_scale = 1 + guidance_scale * (
-                        (
+                        (
+                            1
+                            - math.cos(
+                                math.pi
+                                * ((num_inference_steps - timesteps_cpu[i].item()) / num_inference_steps) ** 5.0
+                            )
+                        )
+                        / 2
                     )
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
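The expanded expression is the cosine-power schedule for dynamic classifier-free guidance, now indexing `timesteps_cpu` instead of calling `.item()` on the loop variable. The same arithmetic factored into a standalone helper for clarity; `t` is the scheduler timestep value at the current step.

import math

def dynamic_guidance_scale(guidance_scale: float, t: float, num_inference_steps: int) -> float:
    # 1 + w * (1 - cos(pi * ((N - t) / N) ** 5)) / 2, matching the in-loop expression above.
    return 1 + guidance_scale * (
        (1 - math.cos(math.pi * ((num_inference_steps - t) / num_inference_steps) ** 5.0)) / 2
    )

print(dynamic_guidance_scale(6.0, 40.0, 50))  # ~1.0 for a small timestep value
print(dynamic_guidance_scale(6.0, 0.0, 50))   # 7.0, i.e. 1 + guidance_scale when t == 0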
@@ -949,4 +963,4 @@ class ConsisIDPipeline(DiffusionPipeline):
         if not return_dict:
             return (video,)

-        return ConsisIDPipelineOutput(frames=video)
+        return ConsisIDPipelineOutput(frames=video)