StevenZhang committed on
Commit
cb09eab
·
1 Parent(s): a7c588c

init upload

Browse files
README.md CHANGED
@@ -1,3 +1,55 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ ```
# Text-to-video inference example for the Wan T2V checkpoint in ./wan_t2v.
# Renders one 1280x720, 81-frame clip and writes it to output.mp4 at 16 fps.
import os

# HF_ENDPOINT has to be set before transformers/diffusers are imported:
# huggingface_hub reads it at import time, so a later assignment is ignored.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch
from transformers import AutoTokenizer, UMT5EncoderModel
from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel, FlowMatchEulerDiscreteScheduler
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
from torchvision import transforms
import cv2
import numpy as np

pretrained_model_name_or_path = "./wan_t2v"

# No torch_dtype is passed for the transformer, so it loads in the library's
# default dtype; only the text encoder is explicitly loaded in bfloat16.
transformer_t2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="transformer",
)
text_encoder = UMT5EncoderModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="text_encoder",
    torch_dtype=torch.bfloat16,
)

pipe = WanPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_t2v,
    text_encoder=text_encoder,
)

# flow_shift is a scheduler setting, not a WanPipeline.__call__ argument;
# passing it through pipe(**inputs) raises TypeError on stock diffusers.
# NOTE(review): this follows the diffusers Wan recipe and replaces whatever
# scheduler the checkpoint ships with - confirm UniPC is the intended sampler.
flow_shift = 5.0
pipe.scheduler = UniPCMultistepScheduler(
    prediction_type="flow_prediction",
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=flow_shift,
)

negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

device = "cuda"
seed = 0

# A seeded generator makes the sampled noise reproducible across runs.
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
    "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
    "negative_prompt": negative_prompt,
    "generator": generator,
    "num_inference_steps": 50,
    "guidance_scale": 5.0,
    "height": 720,
    "width": 1280,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
}

# Enable diffusers' model CPU offload before running the pipeline.
pipe.enable_model_cpu_offload()

# .frames holds one entry per generated video; take the only one.
video = pipe(**inputs).frames[0]

export_to_video(video, "output.mp4", fps=16)
54
+
55
+ ```
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "WanTransformer3DModel",
3
+ "_diffusers_version": "0.33.0.dev0",
4
+ "add_img_emb": false,
5
+ "added_kv_proj_dim": null,
6
+ "attention_head_dim": 128,
7
+ "cross_attn_norm": true,
8
+ "eps": 1e-06,
9
+ "ffn_dim": 8960,
10
+ "freq_dim": 256,
11
+ "in_channels": 16,
12
+ "num_attention_heads": 12,
13
+ "num_layers": 30,
14
+ "out_channels": 16,
15
+ "patch_size": [
16
+ 1,
17
+ 2,
18
+ 2
19
+ ],
20
+ "qk_norm": true,
21
+ "text_dim": 4096,
22
+ "window_size": [
23
+ -1,
24
+ -1
25
+ ]
26
+ }
diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd8d923aaa810641cda15b19bc222d0e7197bfd257f525c2ab2407bc52de1f69
3
+ size 5676069600
wan_t2v_fp32_example.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Text-to-video inference example for a Wan T2V checkpoint.

Loads the transformer and text encoder from ``./wan_t2v``, builds a
``WanPipeline`` around them, renders one 1280x720 clip of 81 frames with
50 inference steps, and writes the result to ``output.mp4`` at 16 fps.
"""
import os

# HF_ENDPOINT has to be set before transformers/diffusers are imported:
# huggingface_hub reads it at import time, so a later assignment is ignored.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch
from transformers import AutoTokenizer, UMT5EncoderModel
from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel, FlowMatchEulerDiscreteScheduler
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
from torchvision import transforms
import cv2
import numpy as np

pretrained_model_name_or_path = "./wan_t2v"

# No torch_dtype is passed for the transformer, so it loads in the library's
# default dtype (the file name suggests fp32 is intended); only the text
# encoder is explicitly loaded in bfloat16.
transformer_t2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="transformer",
)
text_encoder = UMT5EncoderModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="text_encoder",
    torch_dtype=torch.bfloat16,
)

pipe = WanPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_t2v,
    text_encoder=text_encoder,
)

# flow_shift is a scheduler setting, not a WanPipeline.__call__ argument;
# passing it through pipe(**inputs) raises TypeError on stock diffusers.
# NOTE(review): this follows the diffusers Wan recipe and replaces whatever
# scheduler the checkpoint ships with - confirm UniPC is the intended sampler.
flow_shift = 5.0
pipe.scheduler = UniPCMultistepScheduler(
    prediction_type="flow_prediction",
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=flow_shift,
)

negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

device = "cuda"
seed = 0

# A seeded generator makes the sampled noise reproducible across runs.
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
    "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
    "negative_prompt": negative_prompt,
    "generator": generator,
    "num_inference_steps": 50,
    "guidance_scale": 5.0,
    "height": 720,
    "width": 1280,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
}

# Enable diffusers' model CPU offload before running the pipeline.
pipe.enable_model_cpu_offload()

# .frames holds one entry per generated video; take the only one.
video = pipe(**inputs).frames[0]

export_to_video(video, "output.mp4", fps=16)