update 5b

- README.md +3 -2
- app.py +23 -10
- videosys/core/engine.py +2 -2
- videosys/core/pab_mgr.py +1 -0
- videosys/core/pipeline.py +18 -0
- videosys/pipelines/cogvideox/pipeline_cogvideox.py +66 -13
- videosys/pipelines/latte/pipeline_latte.py +54 -10
- videosys/pipelines/open_sora/pipeline_open_sora.py +72 -20
- videosys/pipelines/open_sora_plan/pipeline_open_sora_plan.py +56 -14
- videosys/utils/utils.py +3 -3
README.md CHANGED
@@ -11,10 +11,11 @@ app_port: 7860
 app_file: app.py
 models:
 - THUDM/CogVideoX-2b
+- THUDM/CogVideoX-5b
 tags:
 - cogvideox
 - video-generation
--
+- videosys
 short_description: Text-to-Video
 disable_embedding: false
----
+---
app.py CHANGED
@@ -13,9 +13,9 @@ import spaces
 from videosys import CogVideoXConfig, CogVideoXPABConfig, VideoSysEngine


-def load_model(enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
+def load_model(model_name, enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
     pab_config = CogVideoXPABConfig(spatial_threshold=pab_threshold, spatial_range=pab_range)
-    config = CogVideoXConfig(enable_pab=enable_video_sys, pab_config=pab_config)
+    config = CogVideoXConfig(model_name, enable_pab=enable_video_sys, pab_config=pab_config)
     engine = VideoSysEngine(config)
     return engine

@@ -50,15 +50,16 @@ def get_server_status():
     return {"cpu": f"{cpu_percent}%", "memory": f"{memory.percent}%", "disk": f"{disk.percent}%", "gpu": gpu_info}


-@spaces.GPU(duration=
-def generate_vanilla(prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
-    engine = load_model()
+@spaces.GPU(duration=400)
+def generate_vanilla(model_name, prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
+    engine = load_model(model_name)
     video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
     return video_path


-@spaces.GPU(duration=
+@spaces.GPU(duration=360)
 def generate_vs(
+    model_name,
     prompt,
     num_inference_steps,
     guidance_scale,
@@ -69,7 +70,7 @@ def generate_vs(
 ):
     threshold = [int(threshold_end), int(threshold_start)]
     gap = int(gap)
-    engine = load_model(enable_video_sys=True, pab_threshold=threshold, pab_range=gap)
+    engine = load_model(model_name, enable_video_sys=True, pab_threshold=threshold, pab_range=gap)
     video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
     return video_path

@@ -202,10 +203,14 @@ with gr.Blocks(css=css) as demo:

     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=
+            prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=3)

         with gr.Column():
             gr.Markdown("**Generation Parameters**<br>")
+            with gr.Row():
+                model_name = gr.Dropdown(
+                    ["THUDM/CogVideoX-2b", "THUDM/CogVideoX-5b"], label="Model Type", value="THUDM/CogVideoX-2b"
+                )
             with gr.Row():
                 num_inference_steps = gr.Number(label="Inference Steps", value=50)
                 guidance_scale = gr.Number(label="Guidance Scale", value=6.0)

@@ -240,7 +245,7 @@ with gr.Blocks(css=css) as demo:

        generate_button.click(
            generate_vanilla,
-           inputs=[prompt, num_inference_steps, guidance_scale],
+           inputs=[model_name, prompt, num_inference_steps, guidance_scale],
            outputs=[video_output],
            concurrency_id="gen",
            concurrency_limit=1,

@@ -248,7 +253,15 @@ with gr.Blocks(css=css) as demo:

        generate_button_vs.click(
            generate_vs,
-           inputs=[
+           inputs=[
+               model_name,
+               prompt,
+               num_inference_steps,
+               guidance_scale,
+               pab_threshold_start,
+               pab_threshold_end,
+               pab_range,
+           ],
            outputs=[video_output_vs],
            concurrency_id="gen",
            concurrency_limit=1,
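
Taken together, the app.py changes thread a single `model_name` value from the new dropdown into `load_model`, so one Space serves both checkpoints. A minimal sketch of the resulting call path, using only the API visible in this diff:

```python
from videosys import CogVideoXConfig, CogVideoXPABConfig, VideoSysEngine

def make_engine(model_name: str, enable_pab: bool = False) -> VideoSysEngine:
    # Mirrors load_model() above: PAB settings are built first, then the
    # chosen checkpoint name is passed through to the config.
    pab_config = CogVideoXPABConfig(spatial_threshold=[100, 850], spatial_range=2)
    config = CogVideoXConfig(model_name, enable_pab=enable_pab, pab_config=pab_config)
    return VideoSysEngine(config)

# Either value offered by the new gr.Dropdown works here.
engine = make_engine("THUDM/CogVideoX-5b")
```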
videosys/core/engine.py CHANGED
@@ -20,7 +20,7 @@ class VideoSysEngine:
         self._init_worker(config.pipeline_cls)

     def _init_worker(self, pipeline_cls):
-        world_size = self.config.world_size
+        world_size = self.config.num_gpus

         if "CUDA_VISIBLE_DEVICES" not in os.environ:
             os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(world_size))
@@ -68,7 +68,7 @@ class VideoSysEngine:

     # TODO: add more options here for pipeline, or wrap all options into config
     def _create_pipeline(self, pipeline_cls, rank=0, local_rank=0, distributed_init_method=None):
-        videosys.initialize(rank=rank, world_size=self.config.world_size, init_method=distributed_init_method, seed=42)
+        videosys.initialize(rank=rank, world_size=self.config.num_gpus, init_method=distributed_init_method, seed=42)

         pipeline = pipeline_cls(self.config)
         return pipeline
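
The rename from `config.world_size` to `config.num_gpus` keeps `_init_worker` and `_create_pipeline` in sync with the new config field. The device pinning it feeds is plain stdlib behavior; a small sketch of what the engine does before spawning workers:

```python
import os

# With num_gpus = 2 and CUDA_VISIBLE_DEVICES unset, the engine exposes
# exactly the first two devices to its workers.
num_gpus = 2
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(num_gpus))
print(os.environ["CUDA_VISIBLE_DEVICES"])  # "0,1"
```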
videosys/core/pab_mgr.py CHANGED
@@ -94,6 +94,7 @@ class PABManager:
     @staticmethod
     def _is_t_in_skip_config(all_timesteps, timestep, config):
         is_t_in_skip_config = False
+        skip_range = None
         for key in config:
             if key not in all_timesteps:
                 continue
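
The added `skip_range = None` matters because the loop may `continue` past every key: without a prior binding, any use of `skip_range` after the loop would raise `UnboundLocalError`. A hypothetical reduction of the failure mode (the rest of the method body is not shown in this diff):

```python
def broken(config, all_timesteps):
    for key in config:
        if key not in all_timesteps:
            continue
        skip_range = [key]          # only bound when a key matches
    return skip_range               # UnboundLocalError if no key matched

def fixed(config, all_timesteps):
    skip_range = None               # the one-line fix: always bound
    for key in config:
        if key not in all_timesteps:
            continue
        skip_range = [key]
    return skip_range               # None when nothing matched
```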
videosys/core/pipeline.py CHANGED
@@ -1,3 +1,4 @@
+import inspect
 from abc import abstractmethod
 from dataclasses import dataclass

@@ -28,6 +29,23 @@ class VideoSysPipeline(DiffusionPipeline):
         """
         return self.generate(*args, **kwargs)

+    @classmethod
+    def _get_signature_keys(cls, obj):
+        parameters = inspect.signature(obj.__init__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        expected_modules = set(required_parameters.keys()) - {"self"}
+        # modify: remove the config module from the expected modules
+        expected_modules = expected_modules - {"config"}
+
+        optional_names = list(optional_parameters)
+        for name in optional_names:
+            if name in cls._optional_components:
+                expected_modules.add(name)
+                optional_parameters.remove(name)
+
+        return expected_modules, optional_parameters
+

 @dataclass
 class VideoSysPipelineOutput(BaseOutput):
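
The new `_get_signature_keys` override reimplements the diffusers helper so that the `config` argument of a VideoSys pipeline's `__init__` is not treated as a module to register. A self-contained sketch of what it computes for a toy pipeline (hypothetical class, standard `inspect` calls only):

```python
import inspect

class ToyPipeline:
    _optional_components = ["scheduler"]

    def __init__(self, config, vae, scheduler=None, progress_bar=True):
        pass

params = inspect.signature(ToyPipeline.__init__).parameters
required = {k for k, v in params.items() if v.default is inspect.Parameter.empty}
optional = {k for k, v in params.items() if v.default is not inspect.Parameter.empty}

# Required args minus {"self", "config"} are expected modules; optional
# args listed in _optional_components are promoted to modules too.
expected = (required - {"self", "config"}) | (optional & set(ToyPipeline._optional_components))
print(expected)                                          # {'vae', 'scheduler'}
print(optional - set(ToyPipeline._optional_components))  # {'progress_bar'}
```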
videosys/pipelines/cogvideox/pipeline_cogvideox.py CHANGED
@@ -46,30 +46,75 @@ class CogVideoXPABConfig(PABConfig):


 class CogVideoXConfig:
+    """
+    This config is to instantiate a `CogVideoXPipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        model_path (str):
+            A path to the pretrained pipeline. Defaults to "THUDM/CogVideoX-2b".
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        cpu_offload (bool):
+            Whether to enable CPU offload. Defaults to False.
+        vae_tiling (bool):
+            Whether to enable tiling for the VAE. Defaults to True.
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (CogVideoXPABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `CogVideoXPABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import CogVideoXConfig, VideoSysEngine
+
+        # models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
+        # change num_gpus for multi-gpu inference
+        config = CogVideoXConfig("THUDM/CogVideoX-2b", num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        # num frames should be <= 49. resolution is fixed to 720p.
+        video = engine.generate(
+            prompt=prompt,
+            guidance_scale=6,
+            num_inference_steps=50,
+            num_frames=49,
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
         model_path: str = "THUDM/CogVideoX-2b",
-        world_size: int = 1,
+        # ======= distributed ========
+        num_gpus: int = 1,
+        # ======= memory =======
+        cpu_offload: bool = False,
         vae_tiling: bool = True,
+        # ======= pab ========
         enable_pab: bool = False,
         pab_config=CogVideoXPABConfig(),
     ):
-
-        self.world_size = world_size
-
-        # ======= pipeline ========
+        self.model_path = model_path
         self.pipeline_cls = CogVideoXPipeline
-
+        # ======= distributed ========
+        self.num_gpus = num_gpus
+        # ======= memory ========
+        self.cpu_offload = cpu_offload
         self.vae_tiling = vae_tiling
-
-        # ======= model ========
-        self.model_path = model_path
+        # ======= pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config


 class CogVideoXPipeline(VideoSysPipeline):
-    _optional_components = []
+    _optional_components = ["tokenizer", "text_encoder", "vae", "transformer", "scheduler"]
     model_cpu_offload_seq = "text_encoder->transformer->vae"
     _callback_tensor_inputs = [
         "latents",
@@ -86,11 +131,13 @@ class CogVideoXPipeline(VideoSysPipeline):
         transformer: Optional[CogVideoXTransformer3DModel] = None,
         scheduler: Optional[CogVideoXDDIMScheduler] = None,
         device: torch.device = torch.device("cuda"),
-        dtype: torch.dtype = torch.
+        dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
         self._config = config
         self._device = device
+        if config.model_path == "THUDM/CogVideoX-2b":
+            dtype = torch.float16
         self._dtype = dtype

         if transformer is None:
@@ -99,8 +146,6 @@ class CogVideoXPipeline(VideoSysPipeline):
             )
         if vae is None:
             vae = AutoencoderKLCogVideoX.from_pretrained(config.model_path, subfolder="vae", torch_dtype=self._dtype)
-        if config.vae_tiling:
-            vae.enable_tiling(tile_sample_min_height=vae.tile_sample_min_height // 2)
         if tokenizer is None:
             tokenizer = T5Tokenizer.from_pretrained(config.model_path, subfolder="tokenizer")
         if text_encoder is None:
@@ -120,6 +165,14 @@ class CogVideoXPipeline(VideoSysPipeline):
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )

+        # cpu offload
+        if config.cpu_offload:
+            self.enable_model_cpu_offload()
+
+        # vae tiling
+        if config.vae_tiling:
+            vae.enable_tiling()
+
         # pab
         if config.enable_pab:
             set_pab_manager(config.pab_config)
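
Two behavioral notes on this diff: `_optional_components` now has to list every module, because the new `_get_signature_keys` treats `config` as the only required `__init__` argument; and the pipeline silently drops to fp16 for the 2b checkpoint while keeping bf16 for 5b. That dtype rule in isolation:

```python
import torch

# Mirrors the constructor logic above: CogVideoX-2b runs in float16,
# anything else (e.g. CogVideoX-5b) keeps the bfloat16 default.
def resolve_dtype(model_path: str, dtype: torch.dtype = torch.bfloat16) -> torch.dtype:
    if model_path == "THUDM/CogVideoX-2b":
        dtype = torch.float16
    return dtype

assert resolve_dtype("THUDM/CogVideoX-2b") is torch.float16
assert resolve_dtype("THUDM/CogVideoX-5b") is torch.bfloat16
```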
videosys/pipelines/latte/pipeline_latte.py CHANGED
@@ -79,10 +79,59 @@ class LattePABConfig(PABConfig):


 class LatteConfig:
+    """
+    This config is to instantiate a `LattePipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        model_path (str):
+            A path to the pretrained pipeline. Defaults to "maxin-cn/Latte-1".
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        enable_vae_temporal_decoder (bool):
+            Whether to enable VAE Temporal Decoder. Defaults to True.
+        beta_start (float):
+            The initial value of beta for DDIM. Defaults to 0.0001.
+        beta_end (float):
+            The final value of beta for DDIM. Defaults to 0.02.
+        beta_schedule (str):
+            The schedule of beta for DDIM. Defaults to "linear".
+        variance_type (str):
+            The type of variance for DDIM. Defaults to "learned_range".
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (LattePABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `LattePABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import LatteConfig, VideoSysEngine
+
+        # change num_gpus for multi-gpu inference
+        config = LatteConfig("maxin-cn/Latte-1", num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        # video size is fixed to 16 frames, 512x512.
+        video = engine.generate(
+            prompt=prompt,
+            guidance_scale=7.5,
+            num_inference_steps=50,
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
         model_path: str = "maxin-cn/Latte-1",
-        world_size: int = 1,
+        # ======= distributed =======
+        num_gpus: int = 1,
+        # ======= vae ========
        enable_vae_temporal_decoder: bool = True,
        # ======= scheduler ========
        beta_start: float = 0.0001,
@@ -93,22 +142,17 @@ class LatteConfig:
         enable_pab: bool = False,
         pab_config: PABConfig = LattePABConfig(),
     ):
-        # ======= engine ========
-        self.world_size = world_size
-
-        # ======= pipeline ========
-        self.pipeline_cls = LattePipeline
-
-        # ======= model ========
         self.model_path = model_path
+        self.pipeline_cls = LattePipeline
+        # ======= distributed =======
+        self.num_gpus = num_gpus
+        # ======= vae ========
         self.enable_vae_temporal_decoder = enable_vae_temporal_decoder
-
         # ======= scheduler ========
         self.beta_start = beta_start
         self.beta_end = beta_end
         self.beta_schedule = beta_schedule
         self.variance_type = variance_type
-
         # ======= pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config
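
The scheduler fields added to `LatteConfig` are plain DDPM/DDIM hyperparameters. A sketch of how they could map onto a diffusers scheduler (assumed mapping for illustration; the pipeline's actual wiring is outside this diff):

```python
from diffusers import DDPMScheduler

# The four LatteConfig fields, passed straight through. variance_type
# only applies to schedulers that predict a variance, hence DDPMScheduler
# here rather than DDIMScheduler.
scheduler = DDPMScheduler(
    beta_start=0.0001,
    beta_end=0.02,
    beta_schedule="linear",
    variance_type="learned_range",
)
```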
videosys/pipelines/open_sora/pipeline_open_sora.py CHANGED
@@ -69,38 +69,91 @@ class OpenSoraPABConfig(PABConfig):


 class OpenSoraConfig:
+    """
+    This config is to instantiate an `OpenSoraPipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        transformer (str):
+            The transformer model to use. Defaults to "hpcai-tech/OpenSora-STDiT-v3".
+        vae (str):
+            The VAE model to use. Defaults to "hpcai-tech/OpenSora-VAE-v1.2".
+        text_encoder (str):
+            The text encoder model to use. Defaults to "DeepFloyd/t5-v1_1-xxl".
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        num_sampling_steps (int):
+            The number of sampling steps. Defaults to 30.
+        cfg_scale (float):
+            The configuration scale. Defaults to 7.0.
+        tiling_size (int):
+            The tiling size. Defaults to 4.
+        enable_flash_attn (bool):
+            Whether to enable Flash Attention. Defaults to False.
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (OpenSoraPABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `OpenSoraPABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import OpenSoraConfig, VideoSysEngine
+
+        # change num_gpus for multi-gpu inference
+        # sampling parameters are defined in the config
+        config = OpenSoraConfig(num_sampling_steps=30, cfg_scale=7.0, num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        # num frames: 2s, 4s, 8s, 16s
+        # resolution: 144p, 240p, 360p, 480p, 720p
+        # aspect ratio: 9:16, 16:9, 3:4, 4:3, 1:1
+        video = engine.generate(
+            prompt=prompt,
+            resolution="480p",
+            aspect_ratio="9:16",
+            num_frames="2s",
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
-
-        world_size: int = 1,
+        transformer: str = "hpcai-tech/OpenSora-STDiT-v3",
         vae: str = "hpcai-tech/OpenSora-VAE-v1.2",
         text_encoder: str = "DeepFloyd/t5-v1_1-xxl",
-        #
+        # ======== distributed ========
+        num_gpus: int = 1,
+        # ======== scheduler ========
         num_sampling_steps: int = 30,
         cfg_scale: float = 7.0,
-        #
+        # ======== vae ========
         tiling_size: int = 4,
-        #
+        # ======== speedup ========
+        enable_flash_attn: bool = False,
+        # ======== pab ========
         enable_pab: bool = False,
         pab_config: PABConfig = OpenSoraPABConfig(),
     ):
-        # ======= engine ========
-        self.world_size = world_size
-
-        # ======= pipeline ========
         self.pipeline_cls = OpenSoraPipeline
-        self.transformer =
+        self.transformer = transformer
         self.vae = vae
         self.text_encoder = text_encoder
-
-
+        # ======== distributed ========
+        self.num_gpus = num_gpus
+        # ======== scheduler ========
         self.num_sampling_steps = num_sampling_steps
         self.cfg_scale = cfg_scale
-
-        # ======= vae ========
+        # ======== vae ========
         self.tiling_size = tiling_size
-
-
+        # ======== speedup ========
+        self.enable_flash_attn = enable_flash_attn
+        # ======== pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config

@@ -157,16 +210,15 @@ class OpenSoraPipeline(VideoSysPipeline):
             tokenizer = AutoTokenizer.from_pretrained(config.text_encoder)
         if vae is None:
             vae = OpenSoraVAE_V1_2(
-                from_pretrained=
+                from_pretrained=config.vae,
                 micro_frame_size=17,
                 micro_batch_size=config.tiling_size,
             ).to(dtype)
         if transformer is None:
             transformer = STDiT3_XL_2(
-                from_pretrained=
+                from_pretrained=config.transformer,
                 qk_norm=True,
-                enable_flash_attn=
-                enable_layernorm_kernel=True,
+                enable_flash_attn=config.enable_flash_attn,
                 in_channels=vae.out_channels,
                 caption_channels=text_encoder.config.d_model,
                 model_max_length=300,
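
Besides the docstring, the substantive change is that the transformer checkpoint and flash attention are no longer hard-coded in the pipeline: both come from the config (`config.transformer`, `config.enable_flash_attn`), and the removed `enable_layernorm_kernel=True` line means the fused layernorm kernel is no longer forced on. A usage sketch with the new knobs:

```python
from videosys import OpenSoraConfig, VideoSysEngine

# Flash attention is now opt-in via the config instead of being fixed
# inside STDiT3_XL_2(); it requires the flash-attn package at runtime.
config = OpenSoraConfig(
    transformer="hpcai-tech/OpenSora-STDiT-v3",
    enable_flash_attn=True,
    num_sampling_steps=30,
)
engine = VideoSysEngine(config)
```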
videosys/pipelines/open_sora_plan/pipeline_open_sora_plan.py CHANGED
@@ -114,13 +114,61 @@ class OpenSoraPlanPABConfig(PABConfig):


 class OpenSoraPlanConfig:
+    """
+    This config is to instantiate an `OpenSoraPlanPipeline` class for video generation.
+
+    To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+    In the engine, it will be used to instantiate the corresponding pipeline class.
+    And the engine will call the `generate` function of the pipeline to generate the video.
+    If you want to explore the detail of generation, please refer to the pipeline class below.
+
+    Args:
+        transformer (str):
+            The transformer model to use. Defaults to "LanguageBind/Open-Sora-Plan-v1.1.0".
+        ae (str):
+            The Autoencoder model to use. Defaults to "CausalVAEModel_4x8x8".
+        text_encoder (str):
+            The text encoder model to use. Defaults to "DeepFloyd/t5-v1_1-xxl".
+        num_frames (int):
+            The number of frames to generate. Must be one of [65, 221].
+        num_gpus (int):
+            The number of GPUs to use. Defaults to 1.
+        enable_tiling (bool):
+            Whether to enable tiling. Defaults to True.
+        tile_overlap_factor (float):
+            The overlap factor for tiling. Defaults to 0.25.
+        enable_pab (bool):
+            Whether to enable Pyramid Attention Broadcast. Defaults to False.
+        pab_config (OpenSoraPlanPABConfig):
+            The configuration for Pyramid Attention Broadcast. Defaults to `OpenSoraPlanPABConfig()`.
+
+    Examples:
+        ```python
+        from videosys import OpenSoraPlanConfig, VideoSysEngine
+
+        # num frames: 65 or 221
+        # change num_gpus for multi-gpu inference
+        config = OpenSoraPlanConfig(num_frames=65, num_gpus=1)
+        engine = VideoSysEngine(config)
+
+        prompt = "Sunset over the sea."
+        video = engine.generate(
+            prompt=prompt,
+            guidance_scale=7.5,
+            num_inference_steps=150,
+        ).video[0]
+        engine.save_video(video, f"./outputs/{prompt}.mp4")
+        ```
+    """
+
     def __init__(
         self,
-        model_path: str = "LanguageBind/Open-Sora-Plan-v1.1.0",
-        world_size: int = 1,
-        num_frames: int = 65,
+        transformer: str = "LanguageBind/Open-Sora-Plan-v1.1.0",
         ae: str = "CausalVAEModel_4x8x8",
         text_encoder: str = "DeepFloyd/t5-v1_1-xxl",
+        num_frames: int = 65,
+        # ======= distributed ========
+        num_gpus: int = 1,
         # ======= vae =======
         enable_tiling: bool = True,
         tile_overlap_factor: float = 0.25,
@@ -128,24 +176,18 @@ class OpenSoraPlanConfig:
         enable_pab: bool = False,
         pab_config: PABConfig = OpenSoraPlanPABConfig(),
     ):
-        # ======= engine ========
-        self.world_size = world_size
-
-        # ======= pipeline ========
         self.pipeline_cls = OpenSoraPlanPipeline
         self.ae = ae
         self.text_encoder = text_encoder
-
-        # ======= model ========
-        self.model_path = model_path
+        self.transformer = transformer
         assert num_frames in [65, 221], "num_frames must be one of [65, 221]"
         self.num_frames = num_frames
         self.version = f"{num_frames}x512x512"
-
+        # ======= distributed ========
+        self.num_gpus = num_gpus
         # ======= vae ========
         self.enable_tiling = enable_tiling
         self.tile_overlap_factor = tile_overlap_factor
-
         # ======= pab ========
         self.enable_pab = enable_pab
         self.pab_config = pab_config

@@ -200,9 +242,9 @@ class OpenSoraPlanPipeline(VideoSysPipeline):
         if text_encoder is None:
             text_encoder = T5EncoderModel.from_pretrained(config.text_encoder, torch_dtype=torch.float16)
         if vae is None:
-            vae = getae_wrapper(config.ae)(config.model_path, subfolder="vae").to(dtype=dtype)
+            vae = getae_wrapper(config.ae)(config.transformer, subfolder="vae").to(dtype=dtype)
         if transformer is None:
-            transformer = LatteT2V.from_pretrained(config.model_path, subfolder=config.version, torch_dtype=dtype)
+            transformer = LatteT2V.from_pretrained(config.transformer, subfolder=config.version, torch_dtype=dtype)
         if scheduler is None:
             scheduler = PNDMScheduler()
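
Here the old `model_path` field is folded into `transformer`, which now names the repo that both the VAE (`subfolder="vae"`) and the transformer (`subfolder=config.version`) load from. The `version` subfolder is derived from `num_frames`, which the assert restricts to 65 or 221:

```python
from videosys import OpenSoraPlanConfig

# num_frames must be 65 or 221; the checkpoint subfolder follows from it.
config = OpenSoraPlanConfig(num_frames=221)
assert config.version == "221x512x512"
assert config.transformer == "LanguageBind/Open-Sora-Plan-v1.1.0"
```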
videosys/utils/utils.py CHANGED
@@ -76,7 +76,7 @@ def save_video(video, output_path, fps):
     """
     Save a video to disk.
     """
+    if dist.is_initialized() and dist.get_rank() != 0:
+        return
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
-
-    imageio.mimwrite(output_path, video, fps=fps)
-    dist.barrier()
+    imageio.mimwrite(output_path, video, fps=fps)
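
The rewritten guard replaces the old rank-0-writes-then-barrier pattern: non-zero ranks now return before touching the filesystem, and because the check is gated on `dist.is_initialized()`, the function also works in single-process runs that never set up a process group (where an unconditional `dist.barrier()` would fail). The same guard as a reusable decorator, for comparison (a sketch, not part of this commit):

```python
import functools
import torch.distributed as dist

def rank_zero_only(fn):
    """Run fn only on rank 0 (or always, when not distributed)."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        if dist.is_initialized() and dist.get_rank() != 0:
            return None
        return fn(*args, **kwargs)
    return wrapper
```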