BestWishYsh committed
Commit 6c2beee · verified · 1 Parent(s): 0cf7f04

Update models/pipeline_consisid.py

Files changed (1)
  1. models/pipeline_consisid.py +55 -41
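
Two of the changes in the diff below alter what gets computed rather than just how the code is organized: the rotary-embedding base grid is now derived from the transformer config (`sample_width`, `sample_height`) instead of the hard-coded 720x480 resolution, and the dynamic CFG expression now indexes a CPU copy of the timesteps. As a quick sanity check on the first of these, here is a small self-contained sketch comparing the old and new base-grid computations; the numeric values used (`sample_width=90`, `sample_height=60`, `patch_size=2`, `vae_scale_factor_spatial=8`) are assumed CogVideoX-style defaults, not values read from this repository's checkpoint.

```python
# Sketch only: compares the old hard-coded base-grid computation with the new
# config-derived one. All numeric values below are assumptions (CogVideoX-style
# defaults), not values taken from this repository's checkpoint.
vae_scale_factor_spatial = 8           # assumed spatial downscaling of the VAE
patch_size = 2                         # assumed transformer patch size
sample_width, sample_height = 90, 60   # assumed latent sample size (720/8, 480/8)

# Old: pixel resolution divided by (VAE downscale * patch size)
old_base = (720 // (vae_scale_factor_spatial * patch_size),
            480 // (vae_scale_factor_spatial * patch_size))

# New: latent sample size divided by patch size (no hard-coded resolution)
new_base = (sample_width // patch_size, sample_height // patch_size)

print(old_base, new_base)  # (45, 30) (45, 30) -> identical for the default 720x480 setup
```

Under these assumptions the two formulas agree; the advantage of the config-derived form is that a checkpoint trained at a different resolution changes the base grid automatically.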
models/pipeline_consisid.py CHANGED

```diff
@@ -14,7 +14,7 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Any, Dict, List, Optional, Tuple, Union
 
 import cv2
 import numpy as np
@@ -24,15 +24,13 @@ from transformers import T5EncoderModel, T5Tokenizer
 
 from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 from diffusers.image_processor import PipelineImageInput
+from diffusers.loaders import CogVideoXLoraLoaderMixin
 from diffusers.models import AutoencoderKLCogVideoX, ConsisIDTransformer3DModel
 from diffusers.models.embeddings import get_3d_rotary_pos_embed
 from diffusers.pipelines.consisid.pipeline_output import ConsisIDPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from diffusers.utils import (
-    logging,
-    replace_example_docstring,
-)
+from diffusers.schedulers import CogVideoXDPMScheduler
+from diffusers.utils import logging, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
 
@@ -241,7 +239,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class ConsisIDPipeline(DiffusionPipeline):
+class ConsisIDPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using ConsisID.
 
@@ -278,8 +276,8 @@ class ConsisIDPipeline(DiffusionPipeline):
         tokenizer: T5Tokenizer,
         text_encoder: T5EncoderModel,
         vae: AutoencoderKLCogVideoX,
-        transformer: Union[ConsisIDTransformer3DModel],
-        scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
+        transformer: ConsisIDTransformer3DModel,
+        scheduler: CogVideoXDPMScheduler,
     ):
         super().__init__()
 
@@ -611,21 +609,6 @@ class ConsisIDPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )
 
-    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
-    def fuse_qkv_projections(self) -> None:
-        r"""Enables fused QKV projections."""
-        self.fusing_transformer = True
-        self.transformer.fuse_qkv_projections()
-
-    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
-    def unfuse_qkv_projections(self) -> None:
-        r"""Disable QKV projection fusion if enabled."""
-        if not self.fusing_transformer:
-            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
-        else:
-            self.transformer.unfuse_qkv_projections()
-            self.fusing_transformer = False
-
     def _prepare_rotary_positional_embeddings(
         self,
         height: int,
@@ -635,8 +618,8 @@
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_width = self.transformer.config.sample_width // self.transformer.config.patch_size
+        base_size_height = self.transformer.config.sample_height // self.transformer.config.patch_size
 
         grid_crops_coords = get_resize_crop_region_for_grid(
             (grid_height, grid_width), base_size_width, base_size_height
@@ -646,10 +629,9 @@ class ConsisIDPipeline(DiffusionPipeline):
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
+            device=device,
         )
 
-        freqs_cos = freqs_cos.to(device=device)
-        freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin
 
     @property
@@ -660,6 +642,10 @@ class ConsisIDPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -675,8 +661,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         width: int = 720,
         num_frames: int = 49,
         num_inference_steps: int = 50,
-        timesteps: Optional[List[int]] = None,
-        guidance_scale: float = 6,
+        guidance_scale: float = 6.0,
         use_dynamic_cfg: bool = False,
         num_videos_per_prompt: int = 1,
         eta: float = 0.0,
@@ -686,6 +671,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -720,16 +706,17 @@ class ConsisIDPipeline(DiffusionPipeline):
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
             guidance_scale (`float`, *optional*, defaults to 6):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
+            use_dynamic_cfg (`bool`, *optional*, defaults to `False`):
+                If True, dynamically adjusts the guidance scale during inference. This allows the model to use a
+                progressive guidance scale, improving the balance between text-guided generation and image quality over
+                the course of the inference steps. Typically, early inference steps use a higher guidance scale for
+                more faithful image generation, while later steps reduce it for more diverse and natural results.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -752,6 +739,10 @@ class ConsisIDPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -764,6 +755,19 @@ class ConsisIDPipeline(DiffusionPipeline):
             max_sequence_length (`int`, defaults to `226`):
                 Maximum sequence length in encoded prompt. Must be consistent with
                 `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
+            id_vit_hidden (`Optional[torch.Tensor]`, *optional*):
+                The tensor representing the hidden features extracted from the face model, which are used to condition
+                the local facial extractor. This is crucial for the model to obtain high-frequency information of the
+                face. If not provided, the local facial extractor will not run normally.
+            id_cond (`Optional[torch.Tensor]`, *optional*):
+                The tensor representing the hidden features extracted from the clip model, which are used to condition
+                the local facial extractor. This is crucial for the model to edit facial features If not provided, the
+                local facial extractor will not run normally.
+            kps_cond (`Optional[torch.Tensor]`, *optional*):
+                A tensor that determines whether the global facial extractor use keypoint information for conditioning.
+                If provided, this tensor controls whether facial keypoints such as eyes, nose, and mouth landmarks are
+                used during the generation process. This helps ensure the model retains more facial low-frequency
+                information.
 
         Examples:
 
@@ -772,14 +776,14 @@ class ConsisIDPipeline(DiffusionPipeline):
            [`~pipelines.consisid.pipeline_output.ConsisIDPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        """
-        if num_frames > 49:
-            raise ValueError(
-                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
-            )
 
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
        num_videos_per_prompt = 1
 
        # 1. Check inputs. Raise error if not correct
@@ -795,6 +799,7 @@ class ConsisIDPipeline(DiffusionPipeline):
            negative_prompt_embeds=negative_prompt_embeds,
        )
        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
        self._interrupt = False
 
        # 2. Default call parameters
@@ -827,7 +832,7 @@ class ConsisIDPipeline(DiffusionPipeline):
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
 
        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device)
        self._num_timesteps = len(timesteps)
 
        # 5. Prepare latents
@@ -874,6 +879,7 @@ class ConsisIDPipeline(DiffusionPipeline):
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            # for DPM-solver++
            old_pred_original_sample = None
+            timesteps_cpu = timesteps.cpu()
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue
@@ -893,6 +899,7 @@ class ConsisIDPipeline(DiffusionPipeline):
                    encoder_hidden_states=prompt_embeds,
                    timestep=timestep,
                    image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                    return_dict=False,
                    id_vit_hidden=id_vit_hidden,
                    id_cond=id_cond,
@@ -902,7 +909,14 @@ class ConsisIDPipeline(DiffusionPipeline):
                # perform guidance
                if use_dynamic_cfg:
                    self._guidance_scale = 1 + guidance_scale * (
-                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
+                        (
+                            1
+                            - math.cos(
+                                math.pi
+                                * ((num_inference_steps - timesteps_cpu[i].item()) / num_inference_steps) ** 5.0
+                            )
+                        )
+                        / 2
                    )
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
@@ -949,4 +963,4 @@ class ConsisIDPipeline(DiffusionPipeline):
        if not return_dict:
            return (video,)
 
-        return ConsisIDPipelineOutput(frames=video)
+        return ConsisIDPipelineOutput(frames=video)
```
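
The last functional hunk only re-wraps the dynamic CFG expression and switches `t.item()` to `timesteps_cpu[i].item()`, presumably to avoid a GPU sync on every step. A minimal standalone restatement of the same formula is sketched below so the multi-line version can be checked against the original one-liner; the boundary timestep values are illustrative choices, not values taken from the pipeline.

```python
import math

# Standalone restatement of the cosine-based dynamic CFG schedule from the hunk above,
# written as a plain function so the re-wrapped expression can be compared against the
# original one-liner. `timestep` plays the role of `timesteps_cpu[i].item()`.
def dynamic_cfg_scale(guidance_scale: float, timestep: float, num_inference_steps: int) -> float:
    progress = (num_inference_steps - timestep) / num_inference_steps
    return 1 + guidance_scale * ((1 - math.cos(math.pi * progress**5.0)) / 2)

# Boundary behavior of the expression itself (illustrative inputs, not pipeline values):
print(dynamic_cfg_scale(6.0, timestep=50, num_inference_steps=50))  # 1.0, since cos(0) == 1
print(dynamic_cfg_scale(6.0, timestep=0, num_inference_steps=50))   # 7.0, since cos(pi) == -1
```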