StevenZhang committed on
Commit
86059e4
·
1 Parent(s): cb09eab

init upload

Browse files
Files changed (2) hide show
  1. README.md +38 -21
  2. wan_t2v_fp32_example.py +38 -20
README.md CHANGED
@@ -11,45 +11,62 @@ from diffusers.utils import export_to_video
11
  from torchvision import transforms
12
  import os
13
  import cv2
14
- os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
15
  import numpy as np
16
 
17
- pretrained_model_name_or_path = "./wan_t2v"
18
- transformer_t2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer')
19
 
20
- text_encoder = UMT5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder',
21
- torch_dtype=torch.bfloat16)
22
-
23
- pipe = WanPipeline.from_pretrained(
24
- pretrained_model_name_or_path,
25
- transformer=transformer_t2v,
26
- text_encoder=text_encoder,
27
- )
28
-
29
- negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
30
 
31
  device = "cuda"
32
  seed = 0
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  generator = torch.Generator(device=device).manual_seed(seed)
35
  inputs = {
36
  "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
37
- "negative_prompt": negative_prompt,
38
  "generator": generator,
39
  "num_inference_steps": 50,
40
- "flow_shift": 5.0,
41
  "guidance_scale": 5.0,
42
- "height": 720,
43
- "width": 1280,
44
  "num_frames": 81,
45
  "max_sequence_length": 512,
46
  "output_type": "np"
47
  }
48
 
49
- pipe.enable_model_cpu_offload()
50
-
51
  video = pipe(**inputs).frames[0]
52
 
53
- export_to_video(video, "output.mp4", fps=16)
54
 
55
- ```
 
 
11
  from torchvision import transforms
12
  import os
13
  import cv2
 
14
  import numpy as np
15
 
 
 
16
 
17
+ from pathlib import Path
18
+ import json
19
+ from safetensors.torch import safe_open
 
 
 
 
 
 
 
20
 
21
  device = "cuda"
22
  seed = 0
23
 
24
+ # TODO: impl AutoencoderKLWan
25
+ vae = vae.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
26
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
+ vae = vae.to(device)
28
+
29
+ # TODO: impl FlowDPMSolverMultistepScheduler
30
+ scheduler = UniPCMultistepScheduler(prediction_type='flow_prediction', use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=1.0)
31
+
32
+ text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
33
+ tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
34
+
35
+ # 14B
36
+ transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-14B-Diff', torch_dtype=torch.bfloat16)
37
+ # transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-1.3B-Diff', torch_dtype=torch.bfloat16)
38
+
39
+ components = {
40
+ "transformer": transformer,
41
+ "vae": vae,
42
+ "scheduler": scheduler,
43
+ "text_encoder": text_encoder,
44
+ "tokenizer": tokenizer,
45
+ }
46
+ pipe = WanPipeline(**components)
47
+
48
+ pipe.to(device)
49
+
50
+ negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
51
+
52
  generator = torch.Generator(device=device).manual_seed(seed)
53
  inputs = {
54
  "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
55
+ "negative_prompt": negative_prompt, # TODO
56
  "generator": generator,
57
  "num_inference_steps": 50,
58
+ "flow_shift": 3.0,
59
  "guidance_scale": 5.0,
60
+ "height": 480,
61
+ "width": 832,
62
  "num_frames": 81,
63
  "max_sequence_length": 512,
64
  "output_type": "np"
65
  }
66
 
 
 
67
  video = pipe(**inputs).frames[0]
68
 
69
+ print(video.shape)
70
 
71
+ export_to_video(video, "output.mp4", fps=16)
72
+ ```
wan_t2v_fp32_example.py CHANGED
@@ -6,43 +6,61 @@ from diffusers.utils import export_to_video
6
  from torchvision import transforms
7
  import os
8
  import cv2
9
- os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
10
  import numpy as np
11
 
12
- pretrained_model_name_or_path = "./wan_t2v"
13
- transformer_t2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer')
14
 
15
- text_encoder = UMT5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder',
16
- torch_dtype=torch.bfloat16)
17
-
18
- pipe = WanPipeline.from_pretrained(
19
- pretrained_model_name_or_path,
20
- transformer=transformer_t2v,
21
- text_encoder=text_encoder,
22
- )
23
-
24
- negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
25
 
26
  device = "cuda"
27
  seed = 0
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  generator = torch.Generator(device=device).manual_seed(seed)
30
  inputs = {
31
  "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
32
- "negative_prompt": negative_prompt,
33
  "generator": generator,
34
  "num_inference_steps": 50,
35
- "flow_shift": 5.0,
36
  "guidance_scale": 5.0,
37
- "height": 720,
38
- "width": 1280,
39
  "num_frames": 81,
40
  "max_sequence_length": 512,
41
  "output_type": "np"
42
  }
43
 
44
- pipe.enable_model_cpu_offload()
45
-
46
  video = pipe(**inputs).frames[0]
47
 
48
- export_to_video(video, "output.mp4", fps=16)
 
 
 
6
  from torchvision import transforms
7
  import os
8
  import cv2
 
9
  import numpy as np
10
 
 
 
11
 
12
+ from pathlib import Path
13
+ import json
14
+ from safetensors.torch import safe_open
 
 
 
 
 
 
 
15
 
16
  device = "cuda"
17
  seed = 0
18
 
19
+ # TODO: impl AutoencoderKLWan
20
+ vae = vae.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
21
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
+ vae = vae.to(device)
23
+
24
+ # TODO: impl FlowDPMSolverMultistepScheduler
25
+ scheduler = UniPCMultistepScheduler(prediction_type='flow_prediction', use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=1.0)
26
+
27
+ text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
28
+ tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
29
+
30
+ # 14B
31
+ transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-14B-Diff', torch_dtype=torch.bfloat16)
32
+ # transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-1.3B-Diff', torch_dtype=torch.bfloat16)
33
+
34
+ components = {
35
+ "transformer": transformer,
36
+ "vae": vae,
37
+ "scheduler": scheduler,
38
+ "text_encoder": text_encoder,
39
+ "tokenizer": tokenizer,
40
+ }
41
+ pipe = WanPipeline(**components)
42
+
43
+ pipe.to(device)
44
+
45
+ negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
46
+
47
  generator = torch.Generator(device=device).manual_seed(seed)
48
  inputs = {
49
  "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
50
+ "negative_prompt": negative_prompt, # TODO
51
  "generator": generator,
52
  "num_inference_steps": 50,
53
+ "flow_shift": 3.0,
54
  "guidance_scale": 5.0,
55
+ "height": 480,
56
+ "width": 832,
57
  "num_frames": 81,
58
  "max_sequence_length": 512,
59
  "output_type": "np"
60
  }
61
 
 
 
62
  video = pipe(**inputs).frames[0]
63
 
64
+ print(video.shape)
65
+
66
+ export_to_video(video, "output.mp4", fps=16)