Commit 86059e4
Parent(s): cb09eab
init upload

Files changed:
- README.md +38 -21
- wan_t2v_fp32_example.py +38 -20
README.md
CHANGED
@@ -11,45 +11,62 @@ from diffusers.utils import export_to_video
 from torchvision import transforms
 import os
 import cv2
-os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 import numpy as np
 
-pretrained_model_name_or_path = "./wan_t2v"
-transformer_t2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer')
 
-
-
-
-pipe = WanPipeline.from_pretrained(
-    pretrained_model_name_or_path,
-    transformer=transformer_t2v,
-    text_encoder=text_encoder,
-)
-
-negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
+from pathlib import Path
+import json
+from safetensors.torch import safe_open
 
 device = "cuda"
 seed = 0
 
+# TODO: impl AutoencoderKLWan
+vae = vae.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+vae = vae.to(device)
+
+# TODO: impl FlowDPMSolverMultistepScheduler
+scheduler = UniPCMultistepScheduler(prediction_type='flow_prediction', use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=1.0)
+
+text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
+
+# 14B
+transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-14B-Diff', torch_dtype=torch.bfloat16)
+# transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-1.3B-Diff', torch_dtype=torch.bfloat16)
+
+components = {
+    "transformer": transformer,
+    "vae": vae,
+    "scheduler": scheduler,
+    "text_encoder": text_encoder,
+    "tokenizer": tokenizer,
+}
+pipe = WanPipeline(**components)
+
+pipe.to(device)
+
+negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
+
 generator = torch.Generator(device=device).manual_seed(seed)
 inputs = {
     "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
-    "negative_prompt": negative_prompt,
+    "negative_prompt": negative_prompt, # TODO
     "generator": generator,
     "num_inference_steps": 50,
-    "flow_shift":
+    "flow_shift": 3.0,
     "guidance_scale": 5.0,
-    "height":
-    "width":
+    "height": 480,
+    "width": 832,
     "num_frames": 81,
     "max_sequence_length": 512,
     "output_type": "np"
 }
 
-pipe.enable_model_cpu_offload()
-
 video = pipe(**inputs).frames[0]
 
-
+print(video.shape)
 
-
+export_to_video(video, "output.mp4", fps=16)
+```
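Note on the new README example: the VAE load is still a placeholder (`vae = vae.from_pretrained(...)` runs before any `vae` exists, hence the `# TODO: impl AutoencoderKLWan` comment). Below is a minimal sketch of how the assembly could look once that class is available; the `AutoencoderKLWan` import location is an assumption, while the repo ids, dtypes, and component names come from the diff above.

```python
import torch
from transformers import AutoTokenizer, UMT5EncoderModel
from diffusers import UniPCMultistepScheduler

# Assumption: these classes are exposed by the work-in-progress diffusers port;
# AutoencoderKLWan in particular is still marked TODO in the diff above.
from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# VAE left in its default (fp32) precision, text encoder and transformer in bf16,
# mirroring the example.
vae = AutoencoderKLWan.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
transformer = WanTransformer3DModel.from_pretrained(
    "StevenZhang/Wan2.1-T2V-14B-Diff", torch_dtype=torch.bfloat16
)

# The diff constructs the scheduler with flow_shift=1.0 and separately passes
# "flow_shift": 3.0 at call time; both values are kept as-is here.
scheduler = UniPCMultistepScheduler(
    prediction_type="flow_prediction",
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=1.0,
)

pipe = WanPipeline(
    transformer=transformer,
    vae=vae,
    scheduler=scheduler,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
).to(device)
```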
wan_t2v_fp32_example.py
CHANGED
@@ -6,43 +6,61 @@ from diffusers.utils import export_to_video
 from torchvision import transforms
 import os
 import cv2
-os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 import numpy as np
 
-pretrained_model_name_or_path = "./wan_t2v"
-transformer_t2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer')
 
-
-
-
-pipe = WanPipeline.from_pretrained(
-    pretrained_model_name_or_path,
-    transformer=transformer_t2v,
-    text_encoder=text_encoder,
-)
-
-negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
+from pathlib import Path
+import json
+from safetensors.torch import safe_open
 
 device = "cuda"
 seed = 0
 
+# TODO: impl AutoencoderKLWan
+vae = vae.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+vae = vae.to(device)
+
+# TODO: impl FlowDPMSolverMultistepScheduler
+scheduler = UniPCMultistepScheduler(prediction_type='flow_prediction', use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=1.0)
+
+text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
+
+# 14B
+transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-14B-Diff', torch_dtype=torch.bfloat16)
+# transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-1.3B-Diff', torch_dtype=torch.bfloat16)
+
+components = {
+    "transformer": transformer,
+    "vae": vae,
+    "scheduler": scheduler,
+    "text_encoder": text_encoder,
+    "tokenizer": tokenizer,
+}
+pipe = WanPipeline(**components)
+
+pipe.to(device)
+
+negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
+
 generator = torch.Generator(device=device).manual_seed(seed)
 inputs = {
     "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
-    "negative_prompt": negative_prompt,
+    "negative_prompt": negative_prompt, # TODO
     "generator": generator,
     "num_inference_steps": 50,
-    "flow_shift":
+    "flow_shift": 3.0,
     "guidance_scale": 5.0,
-    "height":
-    "width":
+    "height": 480,
+    "width": 832,
     "num_frames": 81,
     "max_sequence_length": 512,
     "output_type": "np"
 }
 
-pipe.enable_model_cpu_offload()
-
 video = pipe(**inputs).frames[0]
 
-
+print(video.shape)
+
+export_to_video(video, "output.mp4", fps=16)
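For readers who don't parse the Chinese strings: the prompt asks for two anthropomorphic cats in cozy boxing gear and bright gloves fighting it out on a spotlit stage, and the negative prompt is a blacklist of common failure modes (overexposure, blurred detail, subtitles, grey cast, JPEG artifacts, extra or fused fingers, malformed limbs, static frames, cluttered backgrounds, walking backwards, and so on). Both files also drop the old `pipe.enable_model_cpu_offload()` call now that the pipeline is assembled by hand. If the assembled pipeline inherits the standard diffusers offload hooks (an assumption for this work-in-progress `WanPipeline`), the end of the example could still run on a smaller GPU roughly as sketched below, continuing from the `pipe` and `inputs` defined above.

```python
# Let accelerate shuttle idle submodules to the CPU instead of keeping the whole
# pipeline resident on the GPU; call this in place of pipe.to(device).
# Assumes WanPipeline inherits DiffusionPipeline.enable_model_cpu_offload().
pipe.enable_model_cpu_offload()

video = pipe(**inputs).frames[0]
print(video.shape)  # with output_type="np", typically (num_frames, height, width, 3)
export_to_video(video, "output.mp4", fps=16)
```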