Spaces:

BestWishYsh
/

ConsisID-preview-Space

Runtime error

App Files Files Community

BestWishYsh committed 17 days ago

Commit

5ef7e81

verified ·

1 Parent(s): 6c2beee

Update models/transformer_consisid.py

Browse files

Files changed (1) hide show

models/transformer_consisid.py +81 -227

models/transformer_consisid.py CHANGED Viewed

@@ -16,7 +16,7 @@ import glob
 import json
 import math
 import os
-from typing import Any, Dict, Optional, Tuple, Union
 import torch
 from torch import nn
@@ -24,11 +24,7 @@ from torch import nn
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.loaders import PeftAdapterMixin
 from diffusers.models.attention import Attention, FeedForward
-from diffusers.models.attention_processor import (
-    AttentionProcessor,
-    CogVideoXAttnProcessor2_0,
-    FusedCogVideoXAttnProcessor2_0,
-)
 from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.modeling_utils import ModelMixin
@@ -40,61 +36,10 @@ from diffusers.utils.torch_utils import maybe_allow_in_graph
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-def ConsisIDFeedForward(dim, mult=4):
-    """
-    Creates a consistent ID feedforward block consisting of layer normalization, two linear layers, and a GELU
-    activation.
-    Args:
-        dim (int): The input dimension of the tensor.
-        mult (int, optional): Multiplier for the inner dimension. Default is 4.
-    Returns:
-        nn.Sequential: A sequence of layers comprising LayerNorm, Linear layers, and GELU.
-    """
-    inner_dim = int(dim * mult)
-    return nn.Sequential(
-        nn.LayerNorm(dim),
-        nn.Linear(dim, inner_dim, bias=False),
-        nn.GELU(),
-        nn.Linear(inner_dim, dim, bias=False),
-    )
-def reshape_tensor(x, heads):
-    """
-    Reshapes the input tensor for multi-head attention.
-    Args:
-        x (torch.Tensor): The input tensor with shape (batch_size, length, width).
-        heads (int): The number of attention heads.
-    Returns:
-        torch.Tensor: The reshaped tensor, with shape (batch_size, heads, length, width).
-    """
-    bs, length, width = x.shape
-    x = x.view(bs, length, heads, -1)
-    x = x.transpose(1, 2)
-    x = x.reshape(bs, heads, length, -1)
-    return x
 class PerceiverAttention(nn.Module):
-    """
-    Implements the Perceiver attention mechanism with multi-head attention.
-    This layer takes two inputs: 'x' (image features) and 'latents' (latent features), applying multi-head attention to
-    both and producing an output tensor with the same dimension as the input tensor 'x'.
-    Args:
-        dim (int): The input dimension.
-        dim_head (int, optional): The dimension of each attention head. Default is 64.
-        heads (int, optional): The number of attention heads. Default is 8.
-        kv_dim (int, optional): The key-value dimension. If None, `dim` is used for both keys and values.
-    """
-    def __init__(self, *, dim, dim_head=64, heads=8, kv_dim=None):
         super().__init__()
         self.scale = dim_head**-0.5
         self.dim_head = dim_head
         self.heads = heads
@@ -107,80 +52,58 @@ class PerceiverAttention(nn.Module):
         self.to_kv = nn.Linear(dim if kv_dim is None else kv_dim, inner_dim * 2, bias=False)
         self.to_out = nn.Linear(inner_dim, dim, bias=False)
-    def forward(self, x, latents):
-        """
-        Forward pass for Perceiver attention.
-        Args:
-            x (torch.Tensor): Image features tensor with shape (batch_size, num_pixels, D).
-            latents (torch.Tensor): Latent features tensor with shape (batch_size, num_latents, D).
-        Returns:
-            torch.Tensor: Output tensor after applying attention and transformation.
-        """
         # Apply normalization
-        x = self.norm1(x)
         latents = self.norm2(latents)
-        b, seq_len, _ = latents.shape  # Get batch size and sequence length
         # Compute query, key, and value matrices
-        q = self.to_q(latents)
-        kv_input = torch.cat((x, latents), dim=-2)
-        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
         # Reshape the tensors for multi-head attention
-        q = reshape_tensor(q, self.heads)
-        k = reshape_tensor(k, self.heads)
-        v = reshape_tensor(v, self.heads)
         # attention
         scale = 1 / math.sqrt(math.sqrt(self.dim_head))
-        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
-        out = weight @ v
         # Reshape and return the final output
-        out = out.permute(0, 2, 1, 3).reshape(b, seq_len, -1)
-        return self.to_out(out)
 class LocalFacialExtractor(nn.Module):
     def __init__(
         self,
-        id_dim=1280,
-        vit_dim=1024,
-        depth=10,
-        dim_head=64,
-        heads=16,
-        num_id_token=5,
-        num_queries=32,
-        output_dim=2048,
-        ff_mult=4,
     ):
-        """
-        Initializes the LocalFacialExtractor class.
-        Parameters:
-        - id_dim (int): The dimensionality of id features.
-        - vit_dim (int): The dimensionality of vit features.
-        - depth (int): Total number of PerceiverAttention and ConsisIDFeedForward layers.
-        - dim_head (int): Dimensionality of each attention head.
-        - heads (int): Number of attention heads.
-        - num_id_token (int): Number of tokens used for identity features.
-        - num_queries (int): Number of query tokens for the latent representation.
-        - output_dim (int): Output dimension after projection.
-        - ff_mult (int): Multiplier for the feed-forward network hidden dimension.
-        """
         super().__init__()
         # Storing identity token and query information
         self.num_id_token = num_id_token
         self.vit_dim = vit_dim
         self.num_queries = num_queries
-        assert depth % 5 == 0
-        self.depth = depth // 5
         scale = vit_dim**-0.5
         # Learnable latent query embeddings
@@ -195,13 +118,18 @@ class LocalFacialExtractor(nn.Module):
                 nn.ModuleList(
                     [
                         PerceiverAttention(dim=vit_dim, dim_head=dim_head, heads=heads),  # Perceiver Attention layer
-                        ConsisIDFeedForward(dim=vit_dim, mult=ff_mult),  # ConsisIDFeedForward layer
                     ]
                 )
             )
         # Mappings for each of the 5 different ViT features
-        for i in range(5):
             setattr(
                 self,
                 f"mapping_{i}",
@@ -227,32 +155,21 @@ class LocalFacialExtractor(nn.Module):
             nn.Linear(vit_dim, vit_dim * num_id_token),
         )
-    def forward(self, x, y):
-        """
-        Forward pass for LocalFacialExtractor.
-        Parameters:
-        - x (Tensor): The input identity embedding tensor of shape (batch_size, id_dim).
-        - y (list of Tensor): A list of 5 visual feature tensors each of shape (batch_size, vit_dim).
-        Returns:
-        - Tensor: The extracted latent features of shape (batch_size, num_queries, output_dim).
-        """
         # Repeat latent queries for the batch size
-        latents = self.latents.repeat(x.size(0), 1, 1)
         # Map the identity embedding to tokens
-        x = self.id_embedding_mapping(x)
-        x = x.reshape(-1, self.num_id_token, self.vit_dim)
         # Concatenate identity tokens with the latent queries
-        latents = torch.cat((latents, x), dim=1)
-        # Process each of the 5 visual feature inputs
-        for i in range(5):
-            vit_feature = getattr(self, f"mapping_{i}")(y[i])
-            ctx_feature = torch.cat((x, vit_feature), dim=1)
             # Pass through the PerceiverAttention and ConsisIDFeedForward layers
             for attn, ff in self.layers[i * self.depth : (i + 1) * self.depth]:
@@ -267,26 +184,9 @@ class LocalFacialExtractor(nn.Module):
 class PerceiverCrossAttention(nn.Module):
-    """
-    Args:
-        dim (int): Dimension of the input latent and output. Default is 3072.
-        dim_head (int): Dimension of each attention head. Default is 128.
-        heads (int): Number of attention heads. Default is 16.
-        kv_dim (int): Dimension of the key/value input, allowing flexible cross-attention. Default is 2048.
-    Attributes:
-        scale (float): Scaling factor used in dot-product attention for numerical stability.
-        norm1 (nn.LayerNorm): Layer normalization applied to the input image features.
-        norm2 (nn.LayerNorm): Layer normalization applied to the latent features.
-        to_q (nn.Linear): Linear layer for projecting the latent features into queries.
-        to_kv (nn.Linear): Linear layer for projecting the input features into keys and values.
-        to_out (nn.Linear): Linear layer for outputting the final result after attention.
-    """
-    def __init__(self, *, dim=3072, dim_head=128, heads=16, kv_dim=2048):
         super().__init__()
         self.scale = dim_head**-0.5
         self.dim_head = dim_head
         self.heads = heads
@@ -301,47 +201,32 @@ class PerceiverCrossAttention(nn.Module):
         self.to_kv = nn.Linear(dim if kv_dim is None else kv_dim, inner_dim * 2, bias=False)
         self.to_out = nn.Linear(inner_dim, dim, bias=False)
-    def forward(self, x, latents):
-        """
-        Args:
-            x (torch.Tensor): Input image features with shape (batch_size, n1, D), where:
-                - batch_size (b): Number of samples in the batch.
-                - n1: Sequence length (e.g., number of patches or tokens).
-                - D: Feature dimension.
-            latents (torch.Tensor): Latent feature representations with shape (batch_size, n2, D), where:
-                - n2: Number of latent elements.
-        Returns:
-            torch.Tensor: Attention-modulated features with shape (batch_size, n2, D).
-        """
         # Apply layer normalization to the input image and latent features
-        x = self.norm1(x)
-        latents = self.norm2(latents)
-        b, seq_len, _ = latents.shape
         # Compute queries, keys, and values
-        q = self.to_q(latents)
-        k, v = self.to_kv(x).chunk(2, dim=-1)
         # Reshape tensors to split into attention heads
-        q = reshape_tensor(q, self.heads)
-        k = reshape_tensor(k, self.heads)
-        v = reshape_tensor(v, self.heads)
         # Compute attention weights
         scale = 1 / math.sqrt(math.sqrt(self.dim_head))
-        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable scaling than post-division
         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
         # Compute the output via weighted combination of values
-        out = weight @ v
         # Reshape and permute to prepare for final linear transformation
-        out = out.permute(0, 2, 1, 3).reshape(b, seq_len, -1)
         return self.to_out(out)
@@ -567,6 +452,9 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             The multiplication factor applied to the feed-forward network's hidden layer size in the Local Facial
             Extractor (LFE). A higher value increases the model's capacity to learn more complex facial feature
             transformations, but also increases the computation and memory requirements.
         local_face_scale (`float`, defaults to `1.0`):
             A scaling factor used to adjust the importance of local facial features in the model. This can influence
             how strongly the model focuses on high frequency face-related content.
@@ -616,6 +504,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         LFE_num_querie: int = 32,
         LFE_output_dim: int = 2048,
         LFE_ff_mult: int = 4,
         local_face_scale: float = 1.0,
     ):
         super().__init__()
@@ -680,8 +569,6 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         )
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)
-        self.gradient_checkpointing = False
         self.is_train_face = is_train_face
         self.is_kps = is_kps
@@ -697,6 +584,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             self.LFE_num_querie = LFE_num_querie
             self.LFE_output_dim = LFE_output_dim
             self.LFE_ff_mult = LFE_ff_mult
             # cross configs
             self.inner_dim = inner_dim
             self.cross_attn_interval = cross_attn_interval
@@ -708,6 +596,8 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             # face modules
             self._init_face_inputs()
     def _set_gradient_checkpointing(self, module, value=False):
         self.gradient_checkpointing = value
@@ -724,8 +614,8 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             num_queries=self.LFE_num_querie,
             output_dim=self.LFE_output_dim,
             ff_mult=self.LFE_ff_mult,
-        )
-        self.local_facial_extractor.to(device, dtype=weight_dtype)
         self.perceiver_cross_attention = nn.ModuleList(
             [
                 PerceiverCrossAttention(
@@ -811,46 +701,6 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         for name, module in self.named_children():
             fn_recursive_attn_processor(name, module, processor)
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
-    def fuse_qkv_projections(self):
-        """
-        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
-        are fused. For cross-attention modules, key and value projection matrices are fused.
-        <Tip warning={true}>
-        This API is 🧪 experimental.
-        </Tip>
-        """
-        self.original_attn_processors = None
-        for _, attn_processor in self.attn_processors.items():
-            if "Added" in str(attn_processor.__class__.__name__):
-                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
-        self.original_attn_processors = self.attn_processors
-        for module in self.modules():
-            if isinstance(module, Attention):
-                module.fuse_projections(fuse=True)
-        self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
-    def unfuse_qkv_projections(self):
-        """Disables the fused QKV projection if enabled.
-        <Tip warning={true}>
-        This API is 🧪 experimental.
-        </Tip>
-        """
-        if self.original_attn_processors is not None:
-            self.set_attn_processor(self.original_attn_processors)
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -863,13 +713,6 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         id_vit_hidden: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ):
-        # fuse clip and insightface
-        if self.is_train_face:
-            assert id_cond is not None and id_vit_hidden is not None
-            valid_face_emb = self.local_facial_extractor(
-                id_cond, id_vit_hidden
-            )  # torch.Size([1, 1280]), list[5](torch.Size([1, 577, 1024]))  ->  torch.Size([1, 32, 2048])
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
@@ -885,6 +728,17 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                 )
         batch_size, num_frames, channels, height, width = hidden_states.shape
         # 1. Time embedding
@@ -1086,4 +940,4 @@ if __name__ == '__main__':
                     id_cond=id_cond if id_cond is not None else None,
                 )[0]
-    print(model_output)

 import json
 import math
 import os
+from typing import Any, List, Dict, Optional, Tuple, Union
 import torch
 from torch import nn
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.loaders import PeftAdapterMixin
 from diffusers.models.attention import Attention, FeedForward
+from diffusers.models.attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0
 from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.modeling_utils import ModelMixin
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class PerceiverAttention(nn.Module):
+    def __init__(self, dim: int, dim_head: int = 64, heads: int = 8, kv_dim: Optional[int] = None):
         super().__init__()
         self.scale = dim_head**-0.5
         self.dim_head = dim_head
         self.heads = heads
         self.to_kv = nn.Linear(dim if kv_dim is None else kv_dim, inner_dim * 2, bias=False)
         self.to_out = nn.Linear(inner_dim, dim, bias=False)
+    def forward(self, image_embeds: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
         # Apply normalization
+        image_embeds = self.norm1(image_embeds)
         latents = self.norm2(latents)
+        batch_size, seq_len, _ = latents.shape  # Get batch size and sequence length
         # Compute query, key, and value matrices
+        query = self.to_q(latents)
+        kv_input = torch.cat((image_embeds, latents), dim=-2)
+        key, value = self.to_kv(kv_input).chunk(2, dim=-1)
         # Reshape the tensors for multi-head attention
+        query = query.reshape(query.size(0), -1, self.heads, self.dim_head).transpose(1, 2)
+        key = key.reshape(key.size(0), -1, self.heads, self.dim_head).transpose(1, 2)
+        value = value.reshape(value.size(0), -1, self.heads, self.dim_head).transpose(1, 2)
         # attention
         scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (query * scale) @ (key * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        output = weight @ value
         # Reshape and return the final output
+        output = output.permute(0, 2, 1, 3).reshape(batch_size, seq_len, -1)
+        return self.to_out(output)
 class LocalFacialExtractor(nn.Module):
     def __init__(
         self,
+        id_dim: int = 1280,
+        vit_dim: int = 1024,
+        depth: int = 10,
+        dim_head: int = 64,
+        heads: int = 16,
+        num_id_token: int = 5,
+        num_queries: int = 32,
+        output_dim: int = 2048,
+        ff_mult: int = 4,
+        num_scale: int = 5,
     ):
         super().__init__()
         # Storing identity token and query information
         self.num_id_token = num_id_token
         self.vit_dim = vit_dim
         self.num_queries = num_queries
+        assert depth % num_scale == 0
+        self.depth = depth // num_scale
+        self.num_scale = num_scale
         scale = vit_dim**-0.5
         # Learnable latent query embeddings
                 nn.ModuleList(
                     [
                         PerceiverAttention(dim=vit_dim, dim_head=dim_head, heads=heads),  # Perceiver Attention layer
+                        nn.Sequential(
+                            nn.LayerNorm(vit_dim),
+                            nn.Linear(vit_dim, vit_dim * ff_mult, bias=False),
+                            nn.GELU(),
+                            nn.Linear(vit_dim * ff_mult, vit_dim, bias=False),
+                        ),  # ConsisIDFeedForward layer
                     ]
                 )
             )
         # Mappings for each of the num_scale different ViT features
+        for i in range(num_scale):
             setattr(
                 self,
                 f"mapping_{i}",
             nn.Linear(vit_dim, vit_dim * num_id_token),
         )
+    def forward(self, id_embeds: torch.Tensor, vit_hidden_states: List[torch.Tensor]) -> torch.Tensor:
         # Repeat latent queries for the batch size
+        latents = self.latents.repeat(id_embeds.size(0), 1, 1)
         # Map the identity embedding to tokens
+        id_embeds = self.id_embedding_mapping(id_embeds)
+        id_embeds = id_embeds.reshape(-1, self.num_id_token, self.vit_dim)
         # Concatenate identity tokens with the latent queries
+        latents = torch.cat((latents, id_embeds), dim=1)
+        # Process each of the num_scale visual feature inputs
+        for i in range(self.num_scale):
+            vit_feature = getattr(self, f"mapping_{i}")(vit_hidden_states[i])
+            ctx_feature = torch.cat((id_embeds, vit_feature), dim=1)
             # Pass through the PerceiverAttention and ConsisIDFeedForward layers
             for attn, ff in self.layers[i * self.depth : (i + 1) * self.depth]:
 class PerceiverCrossAttention(nn.Module):
+    def __init__(self, dim: int = 3072, dim_head: int = 128, heads: int = 16, kv_dim: int = 2048):
         super().__init__()
         self.scale = dim_head**-0.5
         self.dim_head = dim_head
         self.heads = heads
         self.to_kv = nn.Linear(dim if kv_dim is None else kv_dim, inner_dim * 2, bias=False)
         self.to_out = nn.Linear(inner_dim, dim, bias=False)
+    def forward(self, image_embeds: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
         # Apply layer normalization to the input image and latent features
+        image_embeds = self.norm1(image_embeds)
+        hidden_states = self.norm2(hidden_states)
+        batch_size, seq_len, _ = hidden_states.shape
         # Compute queries, keys, and values
+        query = self.to_q(hidden_states)
+        key, value = self.to_kv(image_embeds).chunk(2, dim=-1)
         # Reshape tensors to split into attention heads
+        query = query.reshape(query.size(0), -1, self.heads, self.dim_head).transpose(1, 2)
+        key = key.reshape(key.size(0), -1, self.heads, self.dim_head).transpose(1, 2)
+        value = value.reshape(value.size(0), -1, self.heads, self.dim_head).transpose(1, 2)
         # Compute attention weights
         scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (query * scale) @ (key * scale).transpose(-2, -1)  # More stable scaling than post-division
         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
         # Compute the output via weighted combination of values
+        out = weight @ value
         # Reshape and permute to prepare for final linear transformation
+        out = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, -1)
         return self.to_out(out)
             The multiplication factor applied to the feed-forward network's hidden layer size in the Local Facial
             Extractor (LFE). A higher value increases the model's capacity to learn more complex facial feature
             transformations, but also increases the computation and memory requirements.
+        LFE_num_scale (`int`, optional, defaults to `5`):
+            The number of visual feature scales processed by the Local Facial Extractor (LFE). A higher value increases
+            the model's capacity to learn more complex facial feature transformations, but also increases the
+            computation and memory requirements.
         local_face_scale (`float`, defaults to `1.0`):
             A scaling factor used to adjust the importance of local facial features in the model. This can influence
             how strongly the model focuses on high frequency face-related content.
         LFE_num_querie: int = 32,
         LFE_output_dim: int = 2048,
         LFE_ff_mult: int = 4,
+        LFE_num_scale: int = 5,
         local_face_scale: float = 1.0,
     ):
         super().__init__()
         )
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)
         self.is_train_face = is_train_face
         self.is_kps = is_kps
             self.LFE_num_querie = LFE_num_querie
             self.LFE_output_dim = LFE_output_dim
             self.LFE_ff_mult = LFE_ff_mult
+            self.LFE_num_scale = LFE_num_scale
             # cross configs
             self.inner_dim = inner_dim
             self.cross_attn_interval = cross_attn_interval
             # face modules
             self._init_face_inputs()
+        self.gradient_checkpointing = False
     def _set_gradient_checkpointing(self, module, value=False):
         self.gradient_checkpointing = value
             num_queries=self.LFE_num_querie,
             output_dim=self.LFE_output_dim,
             ff_mult=self.LFE_ff_mult,
+            num_scale=self.LFE_num_scale,
+        ).to(device, dtype=weight_dtype)
         self.perceiver_cross_attention = nn.ModuleList(
             [
                 PerceiverCrossAttention(
         for name, module in self.named_children():
             fn_recursive_attn_processor(name, module, processor)
     def forward(
         self,
         hidden_states: torch.Tensor,
         id_vit_hidden: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ):
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                 )
+        # fuse clip and insightface
+        valid_face_emb = None
+        if self.is_train_face:
+            id_cond = id_cond.to(device=hidden_states.device, dtype=hidden_states.dtype)
+            id_vit_hidden = [
+                tensor.to(device=hidden_states.device, dtype=hidden_states.dtype) for tensor in id_vit_hidden
+            ]
+            valid_face_emb = self.local_facial_extractor(
+                id_cond, id_vit_hidden
+            )  # torch.Size([1, 1280]), list[5](torch.Size([1, 577, 1024]))  ->  torch.Size([1, 32, 2048])
         batch_size, num_frames, channels, height, width = hidden_states.shape
         # 1. Time embedding
                     id_cond=id_cond if id_cond is not None else None,
                 )[0]
+    print(model_output)