update 5b

- README.md +3 -2
- app.py +23 -10
- videosys/core/engine.py +2 -2
- videosys/core/pab_mgr.py +1 -0
- videosys/core/pipeline.py +18 -0
- videosys/pipelines/cogvideox/pipeline_cogvideox.py +66 -13
- videosys/pipelines/latte/pipeline_latte.py +54 -10
- videosys/pipelines/open_sora/pipeline_open_sora.py +72 -20
- videosys/pipelines/open_sora_plan/pipeline_open_sora_plan.py +56 -14
- videosys/utils/utils.py +3 -3
README.md CHANGED
@@ -11,10 +11,11 @@ app_port: 7860
 app_file: app.py
 models:
 - THUDM/CogVideoX-2b
+- THUDM/CogVideoX-5b
 tags:
 - cogvideox
 - video-generation
--
+- videosys
 short_description: Text-to-Video
 disable_embedding: false
----
+---
app.py CHANGED
@@ -13,9 +13,9 @@ import spaces
 from videosys import CogVideoXConfig, CogVideoXPABConfig, VideoSysEngine


-def load_model(enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
+def load_model(model_name, enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
     pab_config = CogVideoXPABConfig(spatial_threshold=pab_threshold, spatial_range=pab_range)
-    config = CogVideoXConfig(enable_pab=enable_video_sys, pab_config=pab_config)
+    config = CogVideoXConfig(model_name, enable_pab=enable_video_sys, pab_config=pab_config)
     engine = VideoSysEngine(config)
     return engine

@@ -50,15 +50,16 @@ def get_server_status():
     return {"cpu": f"{cpu_percent}%", "memory": f"{memory.percent}%", "disk": f"{disk.percent}%", "gpu": gpu_info}


-@spaces.GPU(duration=
-def generate_vanilla(prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
-    engine = load_model()
+@spaces.GPU(duration=400)
+def generate_vanilla(model_name, prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
+    engine = load_model(model_name)
     video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
     return video_path


-@spaces.GPU(duration=
+@spaces.GPU(duration=360)
 def generate_vs(
+    model_name,
     prompt,
     num_inference_steps,
     guidance_scale,
@@ -69,7 +70,7 @@ def generate_vs(
 ):
     threshold = [int(threshold_end), int(threshold_start)]
     gap = int(gap)
-    engine = load_model(enable_video_sys=True, pab_threshold=threshold, pab_range=gap)
+    engine = load_model(model_name, enable_video_sys=True, pab_threshold=threshold, pab_range=gap)
     video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
     return video_path

@@ -202,10 +203,14 @@ with gr.Blocks(css=css) as demo:

     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=
+            prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=3)

         with gr.Column():
             gr.Markdown("**Generation Parameters**<br>")
+            with gr.Row():
+                model_name = gr.Dropdown(
+                    ["THUDM/CogVideoX-2b", "THUDM/CogVideoX-5b"], label="Model Type", value="THUDM/CogVideoX-2b"
+                )
             with gr.Row():
                 num_inference_steps = gr.Number(label="Inference Steps", value=50)
                 guidance_scale = gr.Number(label="Guidance Scale", value=6.0)

@@ -240,7 +245,7 @@ with gr.Blocks(css=css) as demo:

        generate_button.click(
            generate_vanilla,
-           inputs=[prompt, num_inference_steps, guidance_scale],
+           inputs=[model_name, prompt, num_inference_steps, guidance_scale],
            outputs=[video_output],
            concurrency_id="gen",
            concurrency_limit=1,

@@ -248,7 +253,15 @@ with gr.Blocks(css=css) as demo:

        generate_button_vs.click(
            generate_vs,
-           inputs=[
+           inputs=[
+               model_name,
+               prompt,
+               num_inference_steps,
+               guidance_scale,
+               pab_threshold_start,
+               pab_threshold_end,
+               pab_range,
+           ],
            outputs=[video_output_vs],
            concurrency_id="gen",
            concurrency_limit=1,
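
Taken together, the app.py changes thread a single `model_name` value from the new dropdown into `load_model`, so one Space serves both checkpoints. A minimal sketch of the resulting call path, using only the API visible in this diff:

```python
from videosys import CogVideoXConfig, CogVideoXPABConfig, VideoSysEngine

def make_engine(model_name: str, enable_pab: bool = False) -> VideoSysEngine:
    # Mirrors load_model() above: PAB settings are built first, then the
    # chosen checkpoint name is passed through to the config.
    pab_config = CogVideoXPABConfig(spatial_threshold=[100, 850], spatial_range=2)
    config = CogVideoXConfig(model_name, enable_pab=enable_pab, pab_config=pab_config)
    return VideoSysEngine(config)

# Either value offered by the new gr.Dropdown works here.
engine = make_engine("THUDM/CogVideoX-5b")
```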
videosys/core/engine.py CHANGED
@@ -20,7 +20,7 @@ class VideoSysEngine:
         self._init_worker(config.pipeline_cls)

     def _init_worker(self, pipeline_cls):
-        world_size = self.config.world_size
+        world_size = self.config.num_gpus

         if "CUDA_VISIBLE_DEVICES" not in os.environ:
             os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(world_size))
@@ -68,7 +68,7 @@ class VideoSysEngine:

     # TODO: add more options here for pipeline, or wrap all options into config
     def _create_pipeline(self, pipeline_cls, rank=0, local_rank=0, distributed_init_method=None):
-        videosys.initialize(rank=rank, world_size=self.config.world_size, init_method=distributed_init_method, seed=42)
+        videosys.initialize(rank=rank, world_size=self.config.num_gpus, init_method=distributed_init_method, seed=42)

         pipeline = pipeline_cls(self.config)
         return pipeline
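
The rename from `config.world_size` to `config.num_gpus` keeps `_init_worker` and `_create_pipeline` in sync with the new config field. The device pinning it feeds is plain stdlib behavior; a small sketch of what the engine does before spawning workers:

```python
import os

# With num_gpus = 2 and CUDA_VISIBLE_DEVICES unset, the engine exposes
# exactly the first two devices to its workers.
num_gpus = 2
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(num_gpus))
print(os.environ["CUDA_VISIBLE_DEVICES"])  # "0,1"
```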
videosys/core/pab_mgr.py CHANGED
@@ -94,6 +94,7 @@ class PABManager:
     @staticmethod
     def _is_t_in_skip_config(all_timesteps, timestep, config):
         is_t_in_skip_config = False
+        skip_range = None
         for key in config:
             if key not in all_timesteps:
                 continue
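
The added `skip_range = None` matters because the loop may `continue` past every key: without a prior binding, any use of `skip_range` after the loop would raise `UnboundLocalError`. A hypothetical reduction of the failure mode (the rest of the method body is not shown in this diff):

```python
def broken(config, all_timesteps):
    for key in config:
        if key not in all_timesteps:
            continue
        skip_range = [key]          # only bound when a key matches
    return skip_range               # UnboundLocalError if no key matched

def fixed(config, all_timesteps):
    skip_range = None               # the one-line fix: always bound
    for key in config:
        if key not in all_timesteps:
            continue
        skip_range = [key]
    return skip_range               # None when nothing matched
```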
videosys/core/pipeline.py CHANGED
@@ -1,3 +1,4 @@
+import inspect
 from abc import abstractmethod
 from dataclasses import dataclass

@@ -28,6 +29,23 @@ class VideoSysPipeline(DiffusionPipeline):
         """
         return self.generate(*args, **kwargs)

+    @classmethod
+    def _get_signature_keys(cls, obj):
+        parameters = inspect.signature(obj.__init__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        expected_modules = set(required_parameters.keys()) - {"self"}
+        # modify: remove the config module from the expected modules
+        expected_modules = expected_modules - {"config"}
+
+        optional_names = list(optional_parameters)
+        for name in optional_names:
+            if name in cls._optional_components:
+                expected_modules.add(name)
+                optional_parameters.remove(name)
+
+        return expected_modules, optional_parameters
+

 @dataclass
 class VideoSysPipelineOutput(BaseOutput):
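
The new `_get_signature_keys` override reimplements the diffusers helper so that the `config` argument of a VideoSys pipeline's `__init__` is not treated as a module to register. A self-contained sketch of what it computes for a toy pipeline (hypothetical class, standard `inspect` calls only):

```python
import inspect

class ToyPipeline:
    _optional_components = ["scheduler"]

    def __init__(self, config, vae, scheduler=None, progress_bar=True):
        pass

params = inspect.signature(ToyPipeline.__init__).parameters
required = {k for k, v in params.items() if v.default is inspect.Parameter.empty}
optional = {k for k, v in params.items() if v.default is not inspect.Parameter.empty}

# Required args minus {"self", "config"} are expected modules; optional
# args listed in _optional_components are promoted to modules too.
expected = (required - {"self", "config"}) | (optional & set(ToyPipeline._optional_components))
print(expected)                                          # {'vae', 'scheduler'}
print(optional - set(ToyPipeline._optional_components))  # {'progress_bar'}
```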
videosys/pipelines/cogvideox/pipeline_cogvideox.py CHANGED
@@ -46,30 +46,75 @@ class CogVideoXPABConfig(PABConfig):


 class CogVideoXConfig:
+    """
+    This config is to instantiate a `CogVideoXPipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        model_path (str):
+            A path to the pretrained pipeline. Defaults to "THUDM/CogVideoX-2b".
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        cpu_offload (bool):
+            Whether to enable CPU offload. Defaults to False.
+        vae_tiling (bool):
+            Whether to enable tiling for the VAE. Defaults to True.
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (CogVideoXPABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `CogVideoXPABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import CogVideoXConfig, VideoSysEngine
+
+        # models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
+        # change num_gpus for multi-gpu inference
+        config = CogVideoXConfig("THUDM/CogVideoX-2b", num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        # num frames should be <= 49. resolution is fixed to 720p.
+        video = engine.generate(
+            prompt=prompt,
+            guidance_scale=6,
+            num_inference_steps=50,
+            num_frames=49,
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
         model_path: str = "THUDM/CogVideoX-2b",
-        world_size: int = 1,
+        # ======= distributed ========
+        num_gpus: int = 1,
+        # ======= memory =======
+        cpu_offload: bool = False,
         vae_tiling: bool = True,
+        # ======= pab ========
         enable_pab: bool = False,
         pab_config=CogVideoXPABConfig(),
     ):
-
-        self.world_size = world_size
-
-        # ======= pipeline ========
+        self.model_path = model_path
         self.pipeline_cls = CogVideoXPipeline
-
+        # ======= distributed ========
+        self.num_gpus = num_gpus
+        # ======= memory ========
+        self.cpu_offload = cpu_offload
         self.vae_tiling = vae_tiling
-
-        # ======= model ========
-        self.model_path = model_path
+        # ======= pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config


 class CogVideoXPipeline(VideoSysPipeline):
-    _optional_components = []
+    _optional_components = ["tokenizer", "text_encoder", "vae", "transformer", "scheduler"]
     model_cpu_offload_seq = "text_encoder->transformer->vae"
     _callback_tensor_inputs = [
         "latents",
@@ -86,11 +131,13 @@ class CogVideoXPipeline(VideoSysPipeline):
         transformer: Optional[CogVideoXTransformer3DModel] = None,
         scheduler: Optional[CogVideoXDDIMScheduler] = None,
         device: torch.device = torch.device("cuda"),
-        dtype: torch.dtype = torch.
+        dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
         self._config = config
         self._device = device
+        if config.model_path == "THUDM/CogVideoX-2b":
+            dtype = torch.float16
         self._dtype = dtype

         if transformer is None:
@@ -99,8 +146,6 @@ class CogVideoXPipeline(VideoSysPipeline):
             )
         if vae is None:
             vae = AutoencoderKLCogVideoX.from_pretrained(config.model_path, subfolder="vae", torch_dtype=self._dtype)
-        if config.vae_tiling:
-            vae.enable_tiling(tile_sample_min_height=vae.tile_sample_min_height // 2)
         if tokenizer is None:
             tokenizer = T5Tokenizer.from_pretrained(config.model_path, subfolder="tokenizer")
         if text_encoder is None:
@@ -120,6 +165,14 @@ class CogVideoXPipeline(VideoSysPipeline):
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )

+        # cpu offload
+        if config.cpu_offload:
+            self.enable_model_cpu_offload()
+
+        # vae tiling
+        if config.vae_tiling:
+            vae.enable_tiling()
+
         # pab
         if config.enable_pab:
             set_pab_manager(config.pab_config)
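
Two behavioral notes on this diff: `_optional_components` now has to list every module, because the new `_get_signature_keys` treats `config` as the only required `__init__` argument; and the pipeline silently drops to fp16 for the 2b checkpoint while keeping bf16 for 5b. That dtype rule in isolation:

```python
import torch

# Mirrors the constructor logic above: CogVideoX-2b runs in float16,
# anything else (e.g. CogVideoX-5b) keeps the bfloat16 default.
def resolve_dtype(model_path: str, dtype: torch.dtype = torch.bfloat16) -> torch.dtype:
    if model_path == "THUDM/CogVideoX-2b":
        dtype = torch.float16
    return dtype

assert resolve_dtype("THUDM/CogVideoX-2b") is torch.float16
assert resolve_dtype("THUDM/CogVideoX-5b") is torch.bfloat16
```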
videosys/pipelines/latte/pipeline_latte.py CHANGED
@@ -79,10 +79,59 @@ class LattePABConfig(PABConfig):


 class LatteConfig:
+    """
+    This config is to instantiate a `LattePipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        model_path (str):
+            A path to the pretrained pipeline. Defaults to "maxin-cn/Latte-1".
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        enable_vae_temporal_decoder (bool):
+            Whether to enable VAE Temporal Decoder. Defaults to True.
+        beta_start (float):
+            The initial value of beta for DDIM. Defaults to 0.0001.
+        beta_end (float):
+            The final value of beta for DDIM. Defaults to 0.02.
+        beta_schedule (str):
+            The schedule of beta for DDIM. Defaults to "linear".
+        variance_type (str):
+            The type of variance for DDIM. Defaults to "learned_range".
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (LattePABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `LattePABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import LatteConfig, VideoSysEngine
+
+        # change num_gpus for multi-gpu inference
+        config = LatteConfig("maxin-cn/Latte-1", num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        # video size is fixed to 16 frames, 512x512.
+        video = engine.generate(
+            prompt=prompt,
+            guidance_scale=7.5,
+            num_inference_steps=50,
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
         model_path: str = "maxin-cn/Latte-1",
-        world_size: int = 1,
+        # ======= distributed =======
+        num_gpus: int = 1,
+        # ======= vae ========
        enable_vae_temporal_decoder: bool = True,
        # ======= scheduler ========
        beta_start: float = 0.0001,
@@ -93,22 +142,17 @@ class LatteConfig:
         enable_pab: bool = False,
         pab_config: PABConfig = LattePABConfig(),
     ):
-        # ======= engine ========
-        self.world_size = world_size
-
-        # ======= pipeline ========
-        self.pipeline_cls = LattePipeline
-
-        # ======= model ========
         self.model_path = model_path
+        self.pipeline_cls = LattePipeline
+        # ======= distributed =======
+        self.num_gpus = num_gpus
+        # ======= vae ========
         self.enable_vae_temporal_decoder = enable_vae_temporal_decoder
-
         # ======= scheduler ========
         self.beta_start = beta_start
         self.beta_end = beta_end
         self.beta_schedule = beta_schedule
         self.variance_type = variance_type
-
         # ======= pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config
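
The scheduler fields added to `LatteConfig` are plain DDPM/DDIM hyperparameters. A sketch of how they could map onto a diffusers scheduler (assumed mapping for illustration; the pipeline's actual wiring is outside this diff):

```python
from diffusers import DDPMScheduler

# The four LatteConfig fields, passed straight through. variance_type
# only applies to schedulers that predict a variance, hence DDPMScheduler
# here rather than DDIMScheduler.
scheduler = DDPMScheduler(
    beta_start=0.0001,
    beta_end=0.02,
    beta_schedule="linear",
    variance_type="learned_range",
)
```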
videosys/pipelines/open_sora/pipeline_open_sora.py CHANGED
@@ -69,38 +69,91 @@ class OpenSoraPABConfig(PABConfig):


 class OpenSoraConfig:
+    """
+    This config is to instantiate an `OpenSoraPipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        transformer (str):
+            The transformer model to use. Defaults to "hpcai-tech/OpenSora-STDiT-v3".
+        vae (str):
+            The VAE model to use. Defaults to "hpcai-tech/OpenSora-VAE-v1.2".
+        text_encoder (str):
+            The text encoder model to use. Defaults to "DeepFloyd/t5-v1_1-xxl".
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        num_sampling_steps (int):
+            The number of sampling steps. Defaults to 30.
+        cfg_scale (float):
+            The configuration scale. Defaults to 7.0.
+        tiling_size (int):
+            The tiling size. Defaults to 4.
+        enable_flash_attn (bool):
+            Whether to enable Flash Attention. Defaults to False.
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (OpenSoraPABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `OpenSoraPABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import OpenSoraConfig, VideoSysEngine
+
+        # change num_gpus for multi-gpu inference
+        # sampling parameters are defined in the config
+        config = OpenSoraConfig(num_sampling_steps=30, cfg_scale=7.0, num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        # num frames: 2s, 4s, 8s, 16s
+        # resolution: 144p, 240p, 360p, 480p, 720p
+        # aspect ratio: 9:16, 16:9, 3:4, 4:3, 1:1
+        video = engine.generate(
+            prompt=prompt,
+            resolution="480p",
+            aspect_ratio="9:16",
+            num_frames="2s",
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
-
-        world_size: int = 1,
+        transformer: str = "hpcai-tech/OpenSora-STDiT-v3",
         vae: str = "hpcai-tech/OpenSora-VAE-v1.2",
         text_encoder: str = "DeepFloyd/t5-v1_1-xxl",
-        #
+        # ======== distributed ========
+        num_gpus: int = 1,
+        # ======== scheduler ========
         num_sampling_steps: int = 30,
         cfg_scale: float = 7.0,
-        #
+        # ======== vae ========
         tiling_size: int = 4,
-        #
+        # ======== speedup ========
+        enable_flash_attn: bool = False,
+        # ======== pab ========
         enable_pab: bool = False,
         pab_config: PABConfig = OpenSoraPABConfig(),
     ):
-        # ======= engine ========
-        self.world_size = world_size
-
-        # ======= pipeline ========
         self.pipeline_cls = OpenSoraPipeline
-        self.transformer =
+        self.transformer = transformer
         self.vae = vae
         self.text_encoder = text_encoder
-
-
+        # ======== distributed ========
+        self.num_gpus = num_gpus
+        # ======== scheduler ========
         self.num_sampling_steps = num_sampling_steps
         self.cfg_scale = cfg_scale
-
-        # ======= vae ========
+        # ======== vae ========
         self.tiling_size = tiling_size
-
-
+        # ======== speedup ========
+        self.enable_flash_attn = enable_flash_attn
+        # ======== pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config

@@ -157,16 +210,15 @@ class OpenSoraPipeline(VideoSysPipeline):
             tokenizer = AutoTokenizer.from_pretrained(config.text_encoder)
         if vae is None:
             vae = OpenSoraVAE_V1_2(
-                from_pretrained=
+                from_pretrained=config.vae,
                 micro_frame_size=17,
                 micro_batch_size=config.tiling_size,
             ).to(dtype)
         if transformer is None:
             transformer = STDiT3_XL_2(
-                from_pretrained=
+                from_pretrained=config.transformer,
                 qk_norm=True,
-                enable_flash_attn=
-                enable_layernorm_kernel=True,
+                enable_flash_attn=config.enable_flash_attn,
                 in_channels=vae.out_channels,
                 caption_channels=text_encoder.config.d_model,
                 model_max_length=300,
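
Besides the docstring, the substantive change is that the transformer checkpoint and flash attention are no longer hard-coded in the pipeline: both come from the config (`config.transformer`, `config.enable_flash_attn`), and the removed `enable_layernorm_kernel=True` line means the fused layernorm kernel is no longer forced on. A usage sketch with the new knobs:

```python
from videosys import OpenSoraConfig, VideoSysEngine

# Flash attention is now opt-in via the config instead of being fixed
# inside STDiT3_XL_2(); it requires the flash-attn package at runtime.
config = OpenSoraConfig(
    transformer="hpcai-tech/OpenSora-STDiT-v3",
    enable_flash_attn=True,
    num_sampling_steps=30,
)
engine = VideoSysEngine(config)
```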
videosys/pipelines/open_sora_plan/pipeline_open_sora_plan.py CHANGED
@@ -114,13 +114,61 @@ class OpenSoraPlanPABConfig(PABConfig):


 class OpenSoraPlanConfig:
+    """
+    This config is to instantiate an `OpenSoraPlanPipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        transformer (str):
+            The transformer model to use. Defaults to "LanguageBind/Open-Sora-Plan-v1.1.0".
+        ae (str):
+            The Autoencoder model to use. Defaults to "CausalVAEModel_4x8x8".
+        text_encoder (str):
+            The text encoder model to use. Defaults to "DeepFloyd/t5-v1_1-xxl".
+        num_frames (int):
+            The number of frames to generate. Must be one of [65, 221].
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        enable_tiling (bool):
+            Whether to enable tiling. Defaults to True.
+        tile_overlap_factor (float):
+            The overlap factor for tiling. Defaults to 0.25.
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (OpenSoraPlanPABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `OpenSoraPlanPABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import OpenSoraPlanConfig, VideoSysEngine
+
+        # num frames: 65 or 221
+        # change num_gpus for multi-gpu inference
+        config = OpenSoraPlanConfig(num_frames=65, num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        video = engine.generate(
+            prompt=prompt,
+            guidance_scale=7.5,
+            num_inference_steps=150,
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
-        model_path: str = "LanguageBind/Open-Sora-Plan-v1.1.0",
-        world_size: int = 1,
-        num_frames: int = 65,
+        transformer: str = "LanguageBind/Open-Sora-Plan-v1.1.0",
         ae: str = "CausalVAEModel_4x8x8",
         text_encoder: str = "DeepFloyd/t5-v1_1-xxl",
+        num_frames: int = 65,
+        # ======= distributed ========
+        num_gpus: int = 1,
         # ======= vae =======
         enable_tiling: bool = True,
         tile_overlap_factor: float = 0.25,
@@ -128,24 +176,18 @@ class OpenSoraPlanConfig:
         enable_pab: bool = False,
         pab_config: PABConfig = OpenSoraPlanPABConfig(),
     ):
-        # ======= engine ========
-        self.world_size = world_size
-
-        # ======= pipeline ========
         self.pipeline_cls = OpenSoraPlanPipeline
         self.ae = ae
         self.text_encoder = text_encoder
-
-        # ======= model ========
-        self.model_path = model_path
+        self.transformer = transformer
         assert num_frames in [65, 221], "num_frames must be one of [65, 221]"
         self.num_frames = num_frames
         self.version = f"{num_frames}x512x512"
-
+        # ======= distributed ========
+        self.num_gpus = num_gpus
         # ======= vae ========
         self.enable_tiling = enable_tiling
         self.tile_overlap_factor = tile_overlap_factor
-
         # ======= pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config

@@ -200,9 +242,9 @@ class OpenSoraPlanPipeline(VideoSysPipeline):
         if text_encoder is None:
             text_encoder = T5EncoderModel.from_pretrained(config.text_encoder, torch_dtype=torch.float16)
         if vae is None:
-            vae = getae_wrapper(config.ae)(config.model_path, subfolder="vae").to(dtype=dtype)
+            vae = getae_wrapper(config.ae)(config.transformer, subfolder="vae").to(dtype=dtype)
         if transformer is None:
-            transformer = LatteT2V.from_pretrained(config.model_path, subfolder=config.version, torch_dtype=dtype)
+            transformer = LatteT2V.from_pretrained(config.transformer, subfolder=config.version, torch_dtype=dtype)
         if scheduler is None:
             scheduler = PNDMScheduler()
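
Here the old `model_path` field is folded into `transformer`, which now names the repo that both the VAE (`subfolder="vae"`) and the transformer (`subfolder=config.version`) load from. The `version` subfolder is derived from `num_frames`, which the assert restricts to 65 or 221:

```python
from videosys import OpenSoraPlanConfig

# num_frames must be 65 or 221; the checkpoint subfolder follows from it.
config = OpenSoraPlanConfig(num_frames=221)
assert config.version == "221x512x512"
assert config.transformer == "LanguageBind/Open-Sora-Plan-v1.1.0"
```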
videosys/utils/utils.py CHANGED
@@ -76,7 +76,7 @@ def save_video(video, output_path, fps):
     """
     Save a video to disk.
     """
+    if dist.is_initialized() and dist.get_rank() != 0:
+        return
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
-
-    imageio.mimwrite(output_path, video, fps=fps)
-    dist.barrier()
+    imageio.mimwrite(output_path, video, fps=fps)
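
The rewritten guard replaces the old rank-0-writes-then-barrier pattern: non-zero ranks now return before touching the filesystem, and because the check is gated on `dist.is_initialized()`, the function also works in single-process runs that never set up a process group (where an unconditional `dist.barrier()` would fail). The same guard as a reusable decorator, for comparison (a sketch, not part of this commit):

```python
import functools
import torch.distributed as dist

def rank_zero_only(fn):
    """Run fn only on rank 0 (or always, when not distributed)."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        if dist.is_initialized() and dist.get_rank() != 0:
            return None
        return fn(*args, **kwargs)
    return wrapper
```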