Text-to-Video · Diffusers · Safetensors · Japanese · English · art
alfredplpl committed (verified) · Commit a1d0d24 · Parent(s): 047ef40

Update README.md

Files changed (1): README.md (+20, -15)
README.md CHANGED
```diff
@@ -84,20 +84,6 @@ text_encoder = AutoModelForCausalLM.from_pretrained(
 )
 text_encoder=text_encoder.to(device)
 
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "aidealab/commonvideo",
-    torch_dtype=torch_dtype
-)
-transformer=transformer.to(device)
-
-vae = AutoencoderKLCogVideoX.from_pretrained(
-    "THUDM/CogVideoX-2b",
-    subfolder="vae"
-)
-vae=vae.to(dtype=torch_dtype, device=device)
-vae.enable_slicing()
-vae.enable_tiling()
-
 text_inputs = tokenizer(
     prompt,
     padding="max_length",
@@ -122,6 +108,23 @@ null_text_input_ids = null_text_inputs.input_ids
 null_prompt_embeds = text_encoder(null_text_input_ids.to(device), output_hidden_states=True, attention_mask=null_text_inputs.attention_mask.to(device)).hidden_states[-1]
 null_prompt_embeds = null_prompt_embeds.to(dtype=torch_dtype, device=device)
 
+# Free VRAM
+del text_encoder
+
+transformer = CogVideoXTransformer3DModel.from_pretrained(
+    "aidealab/commonvideo",
+    torch_dtype=torch_dtype
+)
+transformer=transformer.to(device)
+
+vae = AutoencoderKLCogVideoX.from_pretrained(
+    "THUDM/CogVideoX-2b",
+    subfolder="vae"
+)
+vae=vae.to(dtype=torch_dtype, device=device)
+vae.enable_slicing()
+vae.enable_tiling()
+
 # Euler discrete sampler with CFG
 z0 = torch.randn(shape, device=device)
 latents = z0.detach().clone().to(torch_dtype)
@@ -137,7 +140,9 @@ with torch.no_grad():
 pred = null_conditional.sample+cfg*(positive_conditional.sample-null_conditional.sample)
 latents = latents.detach().clone() + dt * pred.detach().clone()
 
-# Free vram
+# Free VRAM
+del transformer
+
 latents = latents / vae.config.scaling_factor
 latents = latents.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
 x=vae.decode(latents).sample
```
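
The change is purely about peak VRAM: instead of holding the text encoder, the transformer, and the VAE on the GPU at once, the script now loads each model right before it is needed and `del`s it as soon as its outputs are in hand. PyTorch's caching allocator reuses blocks freed this way within the same process, so the transformer can be loaded into the memory the text encoder just vacated. Below is a minimal, self-contained sketch of that load–use–free pattern; the `nn.Linear` stand-in and the `report` helper are illustrative only (not from the README), and the `gc.collect()` / `torch.cuda.empty_cache()` calls are optional extras on top of the commit's plain `del`:

```python
import gc
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def report(label: str) -> None:
    # Allocator statistics are only meaningful on CUDA devices.
    if device == "cuda":
        print(f"{label}: {torch.cuda.memory_allocated() / 2**20:.0f} MiB allocated")

# Stage 1: a stand-in for the text encoder (the README loads a causal LM here).
text_encoder = torch.nn.Linear(4096, 4096).to(device)
prompt_embeds = text_encoder(torch.randn(1, 4096, device=device)).detach()
report("after text encoding")

del text_encoder          # drop the last reference to the module
gc.collect()              # reclaim it now rather than at some later GC pass
torch.cuda.empty_cache()  # return cached blocks to the driver (optional)
report("after freeing the text encoder")

# Stage 2: the transformer would be loaded here, into the space just freed;
# only `prompt_embeds` survives from stage 1.
```

Note that `del` only helps if no other name still points at the model; keeping the computed `prompt_embeds` and `null_prompt_embeds` around is fine, since those are independent tensors.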