KoolCogVideoX

Running

App Files Files Community

oahzxl committed on Aug 28, 2024

Commit

ab7be96

1 Parent(s): 654833d

update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +167 -0
.isort.cfg +7 -0
.pre-commit-config.yaml +39 -0
app.py +102 -205
docs/dsp.md +0 -25
docs/pab.md +0 -121
eval/pab/commom_metrics/README.md +0 -6
eval/pab/commom_metrics/calculate_lpips.py +0 -97
eval/pab/commom_metrics/calculate_psnr.py +0 -90
eval/pab/commom_metrics/calculate_ssim.py +0 -116
eval/pab/commom_metrics/eval.py +0 -160
eval/pab/experiments/attention_ablation.py +0 -60
eval/pab/experiments/components_ablation.py +0 -46
eval/pab/experiments/latte.py +0 -57
eval/pab/experiments/opensora.py +0 -44
eval/pab/experiments/opensora_plan.py +0 -57
eval/pab/experiments/utils.py +0 -22
eval/pab/vbench/VBench_full_info.json +0 -0
eval/pab/vbench/cal_vbench.py +0 -154
eval/pab/vbench/run_vbench.py +0 -52
examples/cogvideo/sample.py +0 -14
examples/latte/sample.py +0 -24
examples/open_sora/sample.py +0 -24
examples/open_sora_plan/sample.py +0 -24
videosys/__init__.py +9 -13
videosys/core/engine.py +2 -4
videosys/core/pab_mgr.py +43 -175
videosys/datasets/dataloader.py +0 -94
videosys/datasets/image_transform.py +0 -42
videosys/datasets/video_transform.py +0 -441
videosys/diffusion/__init__.py +0 -41
videosys/diffusion/diffusion_utils.py +0 -79
videosys/diffusion/gaussian_diffusion.py +0 -829
videosys/diffusion/respace.py +0 -119
videosys/diffusion/timestep_sampler.py +0 -143
{eval/pab/commom_metrics → videosys/models/autoencoders}/__init__.py +0 -0
videosys/models/{cogvideo/autoencoder_kl.py → autoencoders/autoencoder_kl_cogvideox.py} +328 -94
videosys/models/{open_sora/vae.py → autoencoders/autoencoder_kl_open_sora.py} +2 -9
videosys/models/{open_sora_plan/ae.py → autoencoders/autoencoder_kl_open_sora_plan.py} +797 -14
videosys/models/cogvideo/__init__.py +0 -6
videosys/models/cogvideo/modules.py +0 -317
videosys/models/cogvideo/retrieve_timesteps.py +0 -74
videosys/models/latte/__init__.py +0 -7
{eval/pab/experiments → videosys/models/modules}/__init__.py +0 -0
videosys/models/modules/activations.py +3 -0
videosys/{modules/attn.py → models/modules/attentions.py} +45 -131
videosys/models/modules/downsampling.py +71 -0
videosys/models/{open_sora/modules.py → modules/embeddings.py} +171 -209
videosys/models/modules/normalization.py +102 -0
videosys/models/modules/upsampling.py +67 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,167 @@

+outputs/
+processed/
+profile/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+docs/.build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# IDE
+.idea/
+.vscode/
+# macos
+*.DS_Store
+#data/
+docs/.build
+# pytorch checkpoint
+*.pt
+# ignore any kernel build files
+.o
+.so
+# ignore python interface definition file
+.pyi
+# ignore coverage test file
+coverage.lcov
+coverage.xml
+# ignore testmon and coverage files
+.coverage
+.testmondata*
+pretrained
+samples
+cache_dir
+test_outputs

.isort.cfg ADDED Viewed

	@@ -0,0 +1,7 @@

+[settings]
+line_length = 120
+multi_line_output=3
+include_trailing_comma = true
+ignore_comments = true
+profile = black
+honor_noqa = true

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,39 @@

+repos:
+  - repo: https://github.com/PyCQA/autoflake
+    rev: v2.2.1
+    hooks:
+      - id: autoflake
+        name: autoflake (python)
+        args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: sort all imports (python)
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.9.1
+    hooks:
+    - id: black
+      name: black formatter
+      args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v13.0.1
+    hooks:
+    - id: clang-format
+      name: clang formatter
+      types_or: [c++, c]
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-merge-conflict
+      - id: check-case-conflict
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ['--fix=lf']

app.py CHANGED Viewed

@@ -2,131 +2,107 @@ import os
 os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.getcwd(), ".tmp_outputs")
-import torch
-from openai import OpenAI
-from time import time
-import tempfile
-import uuid
 import logging
 import gradio as gr
-from videosys import CogVideoConfig, VideoSysEngine
-from videosys.models.cogvideo.pipeline import CogVideoPABConfig
 import psutil
-import GPUtil
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-dtype = torch.bfloat16
-sys_prompt = """You are part of a team of bots that creates videos. You work with an assistant bot that will draw anything you say in square brackets.
-For example , outputting " a beautiful morning in the woods with the sun peaking through the trees " will trigger your partner bot to output an video of a forest morning , as described. You will be prompted by people looking to create detailed , amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
-There are a few rules to follow:
-You will only ever output a single video description per user request.
-When modifications are requested , you should not simply make the description longer . You should refactor the entire description to integrate the suggestions.
-Other times the user will not want modifications , but instead want a new image . In this case , you should ignore your previous conversation with the user.
-Video descriptions must have the same num of words as examples below. Extra words will be ignored.
-"""
-def convert_prompt(prompt: str, retry_times: int = 3) -> str:
-    if not os.environ.get("OPENAI_API_KEY"):
-        return prompt
-    client = OpenAI()
-    text = prompt.strip()
-    for i in range(retry_times):
-        response = client.chat.completions.create(
-            messages=[
-                {"role": "system", "content": sys_prompt},
-                {
-                    "role": "user",
-                    "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "a girl is on the beach"',
-                },
-                {
-                    "role": "assistant",
-                    "content": "A radiant woman stands on a deserted beach, arms outstretched, wearing a beige trench coat, white blouse, light blue jeans, and chic boots, against a backdrop of soft sky and sea. Moments later, she is seen mid-twirl, arms exuberant, with the lighting suggesting dawn or dusk. Then, she runs along the beach, her attire complemented by an off-white scarf and black ankle boots, the tranquil sea behind her. Finally, she holds a paper airplane, her pose reflecting joy and freedom, with the ocean's gentle waves and the sky's soft pastel hues enhancing the serene ambiance.",
-                },
-                {
-                    "role": "user",
-                    "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "A man jogging on a football field"',
-                },
-                {
-                    "role": "assistant",
-                    "content": "A determined man in athletic attire, including a blue long-sleeve shirt, black shorts, and blue socks, jogs around a snow-covered soccer field, showcasing his solitary exercise in a quiet, overcast setting. His long dreadlocks, focused expression, and the serene winter backdrop highlight his dedication to fitness. As he moves, his attire, consisting of a blue sports sweatshirt, black athletic pants, gloves, and sneakers, grips the snowy ground. He is seen running past a chain-link fence enclosing the playground area, with a basketball hoop and children's slide, suggesting a moment of solitary exercise amidst the empty field.",
-                },
-                {
-                    "role": "user",
-                    "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : " A woman is dancing, HD footage, close-up"',
-                },
-                {
-                    "role": "assistant",
-                    "content": "A young woman with her hair in an updo and wearing a teal hoodie stands against a light backdrop, initially looking over her shoulder with a contemplative expression. She then confidently makes a subtle dance move, suggesting rhythm and movement. Next, she appears poised and focused, looking directly at the camera. Her expression shifts to one of introspection as she gazes downward slightly. Finally, she dances with confidence, her left hand over her heart, symbolizing a poignant moment, all while dressed in the same teal hoodie against a plain, light-colored background.",
-                },
-                {
-                    "role": "user",
-                    "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
-                },
-            ],
-            model="glm-4-0520",
-            temperature=0.01,
-            top_p=0.7,
-            stream=False,
-            max_tokens=250,
-        )
-        if response.choices:
-            return response.choices[0].message.content
-    return prompt
-def load_model(enable_video_sys=False, pab_threshold=[100, 850], pab_gap=2):
-    pab_config = CogVideoPABConfig(full_threshold=pab_threshold, full_gap=pab_gap)
-    config = CogVideoConfig(world_size=1, enable_pab=enable_video_sys, pab_config=pab_config)
     engine = VideoSysEngine(config)
     return engine
 def generate(engine, prompt, num_inference_steps=50, guidance_scale=6.0):
-    try:
-        video = engine.generate(prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).video[0]
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
-            temp_file.name
-            unique_filename = f"{uuid.uuid4().hex}.mp4"
-            output_path = os.path.join("./temp_outputs", unique_filename)
-            engine.save_video(video, output_path)
-        return output_path
-    except Exception as e:
-        logger.error(f"An error occurred: {str(e)}")
-        return None
 def get_server_status():
     cpu_percent = psutil.cpu_percent()
     memory = psutil.virtual_memory()
-    disk = psutil.disk_usage('/')
     gpus = GPUtil.getGPUs()
     gpu_info = []
     for gpu in gpus:
-        gpu_info.append({
-            'id': gpu.id,
-            'name': gpu.name,
-            'load': f"{gpu.load*100:.1f}%",
-            'memory_used': f"{gpu.memoryUsed}MB",
-            'memory_total': f"{gpu.memoryTotal}MB"
-        })
     return {
-        'cpu': f"{cpu_percent}%",
-        'memory': f"{memory.percent}%",
-        'disk': f"{disk.percent}%",
-        'gpu': gpu_info
     }
 css = """
 body {
@@ -137,16 +113,17 @@ body {
     padding: 20px;
 }
 .container {
     display: flex;
     flex-direction: column;
-    gap: 20px;
 }
 .row {
     display: flex;
     flex-wrap: wrap;
-    gap: 18px;
 }
 .column {
@@ -186,12 +163,6 @@ body {
     font-size: 0.9em !important;
     line-height: 1.2 !important;
 }
-.server-status button {
-    padding: 1px 8px !important;
-    height: 22px !important;
-    font-size: 0.9em !important;
-    margin-top: 2px !important;
-}
 .server-status .textbox {
     gap: 0 !important;
 }
@@ -215,150 +186,76 @@ body {
 """
 with gr.Blocks(css=css) as demo:
-    gr.HTML("""
     <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
-        VideoSys Huggingface Space🤗
     </div>
     <div style="text-align: center; font-size: 15px;">
         🌐 Github: <a href="https://github.com/NUS-HPC-AI-Lab/VideoSys">https://github.com/NUS-HPC-AI-Lab/VideoSys</a><br>
-        ⚠️ This demo is for academic research and experiential use only.
         Users should strictly adhere to local laws and ethics.<br>
         💡 This demo only demonstrates single-device inference. To experience the full power of VideoSys, please deploy it with multiple devices.<br><br>
         </div>
     </div>
-    """)
     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=5)
-            with gr.Row():
-                gr.Markdown(
-                    "✨Upon pressing the enhanced prompt button, we will use [GLM-4 Model](https://github.com/THUDM/GLM-4) to polish the prompt and overwrite the original one."
-                )
-                enhance_button = gr.Button("✨ Enhance Prompt(Optional)")
             with gr.Column():
-                gr.Markdown(
-                    "**Optional Parameters** (default values are recommended)<br>"
-                    "Turn Inference Steps larger if you want more detailed video, but it will be slower.<br>"
-                    "50 steps are recommended for most cases. will cause 120 seconds for inference.<br>"
-                )
                 with gr.Row():
                     num_inference_steps = gr.Number(label="Inference Steps", value=50)
                     guidance_scale = gr.Number(label="Guidance Scale", value=6.0)
-                    pab_gap = gr.Number(label="PAB Gap", value=2, precision=0)
-                    pab_threshold = gr.Textbox(label="PAB Threshold", value="100,850", lines=1)
                 with gr.Row():
-                    generate_button = gr.Button("🎬 Generate Video")
                     generate_button_vs = gr.Button("⚡️ Generate Video with VideoSys (Faster)")
                 with gr.Column(elem_classes="server-status"):
                     gr.Markdown("#### Server Status")
                     with gr.Row():
                         cpu_status = gr.Textbox(label="CPU", scale=1)
                         memory_status = gr.Textbox(label="Memory", scale=1)
                     with gr.Row():
                         disk_status = gr.Textbox(label="Disk", scale=1)
                         gpu_status = gr.Textbox(label="GPU Memory", scale=1)
                     with gr.Row():
-                        refresh_button = gr.Button("Refresh", size="sm")
         with gr.Column():
-            with gr.Row():
-                video_output = gr.Video(label="CogVideoX", width=720, height=480)
-            with gr.Row():
-                download_video_button = gr.File(label="📥 Download Video", visible=False)
-                elapsed_time = gr.Textbox(label="Elapsed Time", value="0s", visible=False)
             with gr.Row():
                 video_output_vs = gr.Video(label="CogVideoX with VideoSys", width=720, height=480)
             with gr.Row():
-                download_video_button_vs = gr.File(label="📥 Download Video", visible=False)
-                elapsed_time_vs = gr.Textbox(label="Elapsed Time", value="0s", visible=False)
-        # with gr.Column():
-        #     task_status = gr.Textbox(label="任务状态", visible=False)
-    def generate_vanilla(prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
-        engine = load_model()
-        t = time()
-        video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
-        elapsed_time = time() - t
-        video_update = gr.update(visible=True, value=video_path)
-        elapsed_time = gr.update(visible=True, value=f"{elapsed_time:.2f}s")
-        return video_path, video_update, elapsed_time
-    def generate_vs(prompt, num_inference_steps, guidance_scale, threshold, gap, progress=gr.Progress(track_tqdm=True)):
-        threshold = [int(i) for i in threshold.split(",")]
-        gap = int(gap)
-        engine = load_model(enable_video_sys=True, pab_threshold=threshold, pab_gap=gap)
-        t = time()
-        video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
-        elapsed_time = time() - t
-        video_update = gr.update(visible=True, value=video_path)
-        elapsed_time = gr.update(visible=True, value=f"{elapsed_time:.2f}s")
-        return video_path, video_update, elapsed_time
-    def enhance_prompt_func(prompt):
-        return convert_prompt(prompt, retry_times=1)
-    def get_server_status():
-        cpu_percent = psutil.cpu_percent()
-        memory = psutil.virtual_memory()
-        disk = psutil.disk_usage('/')
-        try:
-            gpus = GPUtil.getGPUs()
-            if gpus:
-                gpu = gpus[0]
-                gpu_memory = f"{gpu.memoryUsed}/{gpu.memoryTotal}MB ({gpu.memoryUtil*100:.1f}%)"
-            else:
-                gpu_memory = "No GPU found"
-        except:
-            gpu_memory = "GPU information unavailable"
-        return {
-            'cpu': f"{cpu_percent}%",
-            'memory': f"{memory.percent}%",
-            'disk': f"{disk.percent}%",
-            'gpu_memory': gpu_memory
-        }
-    def update_server_status():
-        status = get_server_status()
-        return (
-            status['cpu'],
-            status['memory'],
-            status['disk'],
-            status['gpu_memory']
-        )
     generate_button.click(
         generate_vanilla,
         inputs=[prompt, num_inference_steps, guidance_scale],
-        outputs=[video_output, download_video_button, elapsed_time],
     )
     generate_button_vs.click(
         generate_vs,
-        inputs=[prompt, num_inference_steps, guidance_scale, pab_threshold, pab_gap],
-        outputs=[video_output_vs, download_video_button_vs, elapsed_time_vs],
     )
-    enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])
     refresh_button.click(update_server_status, outputs=[cpu_status, memory_status, disk_status, gpu_status])
     demo.load(update_server_status, outputs=[cpu_status, memory_status, disk_status, gpu_status], every=1)
 if __name__ == "__main__":
     demo.queue(max_size=10, default_concurrency_limit=1)
-    demo.launch()

 os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.getcwd(), ".tmp_outputs")
 import logging
+import uuid
+import GPUtil
 import gradio as gr
 import psutil
+import torch
+from videosys import CogVideoXConfig, CogVideoXPABConfig, VideoSysEngine
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+dtype = torch.float16
+def load_model(enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
+    pab_config = CogVideoXPABConfig(spatial_threshold=pab_threshold, spatial_range=pab_range)
+    config = CogVideoXConfig(world_size=1, enable_pab=enable_video_sys, pab_config=pab_config)
     engine = VideoSysEngine(config)
     return engine
 def generate(engine, prompt, num_inference_steps=50, guidance_scale=6.0):
+    video = engine.generate(prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).video[0]
+    unique_filename = f"{uuid.uuid4().hex}.mp4"
+    output_path = os.path.join("./.tmp_outputs", unique_filename)
+    engine.save_video(video, output_path)
+    return output_path
 def get_server_status():
     cpu_percent = psutil.cpu_percent()
     memory = psutil.virtual_memory()
+    disk = psutil.disk_usage("/")
     gpus = GPUtil.getGPUs()
     gpu_info = []
     for gpu in gpus:
+        gpu_info.append(
+            {
+                "id": gpu.id,
+                "name": gpu.name,
+                "load": f"{gpu.load*100:.1f}%",
+                "memory_used": f"{gpu.memoryUsed}MB",
+                "memory_total": f"{gpu.memoryTotal}MB",
+            }
+        )
+    return {"cpu": f"{cpu_percent}%", "memory": f"{memory.percent}%", "disk": f"{disk.percent}%", "gpu": gpu_info}
+def generate_vanilla(prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
+    engine = load_model()
+    video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
+    return video_path
+def generate_vs(
+    prompt,
+    num_inference_steps,
+    guidance_scale,
+    threshold_start,
+    threshold_end,
+    gap,
+    progress=gr.Progress(track_tqdm=True),
+):
+    threshold = [int(threshold_end), int(threshold_start)]
+    gap = int(gap)
+    engine = load_model(enable_video_sys=True, pab_threshold=threshold, pab_range=gap)
+    video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
+    return video_path
+def get_server_status():
+    cpu_percent = psutil.cpu_percent()
+    memory = psutil.virtual_memory()
+    disk = psutil.disk_usage("/")
+    try:
+        gpus = GPUtil.getGPUs()
+        if gpus:
+            gpu = gpus[0]
+            gpu_memory = f"{gpu.memoryUsed}/{gpu.memoryTotal}MB ({gpu.memoryUtil*100:.1f}%)"
+        else:
+            gpu_memory = "No GPU found"
+    except:
+        gpu_memory = "GPU information unavailable"
     return {
+        "cpu": f"{cpu_percent}%",
+        "memory": f"{memory.percent}%",
+        "disk": f"{disk.percent}%",
+        "gpu_memory": gpu_memory,
     }
+def update_server_status():
+    status = get_server_status()
+    return (status["cpu"], status["memory"], status["disk"], status["gpu_memory"])
 css = """
 body {
     padding: 20px;
 }
 .container {
     display: flex;
     flex-direction: column;
+    gap: 10px;
 }
 .row {
     display: flex;
     flex-wrap: wrap;
+    gap: 10px;
 }
 .column {
     font-size: 0.9em !important;
     line-height: 1.2 !important;
 }
 .server-status .textbox {
     gap: 0 !important;
 }
 """
 with gr.Blocks(css=css) as demo:
+    gr.HTML(
+        """
     <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
+        VideoSys for CogVideoX🤗
     </div>
     <div style="text-align: center; font-size: 15px;">
         🌐 Github: <a href="https://github.com/NUS-HPC-AI-Lab/VideoSys">https://github.com/NUS-HPC-AI-Lab/VideoSys</a><br>
+        ⚠️ This demo is for academic research and experiential use only.
         Users should strictly adhere to local laws and ethics.<br>
         💡 This demo only demonstrates single-device inference. To experience the full power of VideoSys, please deploy it with multiple devices.<br><br>
         </div>
     </div>
+    """
+    )
     with gr.Row():
         with gr.Column():
+            prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=4)
             with gr.Column():
+                gr.Markdown("**Generation Parameters**<br>")
                 with gr.Row():
                     num_inference_steps = gr.Number(label="Inference Steps", value=50)
                     guidance_scale = gr.Number(label="Guidance Scale", value=6.0)
                 with gr.Row():
+                    pab_range = gr.Number(
+                        label="PAB Broadcast Range", value=2, precision=0, info="Broadcast timesteps range."
+                    )
+                    pab_threshold_start = gr.Number(label="PAB Start Timestep", value=850, info="Start from step 1000.")
+                    pab_threshold_end = gr.Number(label="PAB End Timestep", value=100, info="End at step 0.")
+                with gr.Row():
                     generate_button_vs = gr.Button("⚡️ Generate Video with VideoSys (Faster)")
+                    generate_button = gr.Button("🎬 Generate Video (Original)")
                 with gr.Column(elem_classes="server-status"):
                     gr.Markdown("#### Server Status")
                     with gr.Row():
                         cpu_status = gr.Textbox(label="CPU", scale=1)
                         memory_status = gr.Textbox(label="Memory", scale=1)
                     with gr.Row():
                         disk_status = gr.Textbox(label="Disk", scale=1)
                         gpu_status = gr.Textbox(label="GPU Memory", scale=1)
                     with gr.Row():
+                        refresh_button = gr.Button("Refresh")
         with gr.Column():
             with gr.Row():
                 video_output_vs = gr.Video(label="CogVideoX with VideoSys", width=720, height=480)
             with gr.Row():
+                video_output = gr.Video(label="CogVideoX", width=720, height=480)
     generate_button.click(
         generate_vanilla,
         inputs=[prompt, num_inference_steps, guidance_scale],
+        outputs=[video_output],
     )
     generate_button_vs.click(
         generate_vs,
+        inputs=[prompt, num_inference_steps, guidance_scale, pab_threshold_start, pab_threshold_end, pab_range],
+        outputs=[video_output_vs],
     )
     refresh_button.click(update_server_status, outputs=[cpu_status, memory_status, disk_status, gpu_status])
     demo.load(update_server_status, outputs=[cpu_status, memory_status, disk_status, gpu_status], every=1)
 if __name__ == "__main__":
     demo.queue(max_size=10, default_concurrency_limit=1)
+    demo.launch()

docs/dsp.md DELETED Viewed

@@ -1,25 +0,0 @@
-# DSP
-paper: https://arxiv.org/abs/2403.10266
-![dsp_overview](../assets/figures/dsp_overview.png)
-DSP (Dynamic Sequence Parallelism) is a novel, elegant and highly efficient sequence parallelism method for [OpenSora](https://github.com/hpcaitech/Open-Sora), [Latte](https://github.com/Vchitect/Latte) and other multi-dimensional transformer architectures.
-The key idea is to dynamically switch the parallelism dimension according to the current computation stage, leveraging the potential characteristics of multi-dimensional transformers. Compared with previous methods, which split the head and sequence dimensions, it can reduce communication cost by at least 75%.
-It achieves a **3x** speedup for training and a **2x** speedup for inference in OpenSora compared with state-of-the-art sequence parallelism ([DeepSpeed Ulysses](https://arxiv.org/abs/2309.14509)). For a 10s (80 frames) 512x512 video, the inference latency of OpenSora is:
-| Method | 1xH800 | 8xH800 (DS Ulysses) | 8xH800 (DSP) |
-| ------ | ------ | ------ | ------ |
-| Latency(s) | 106 | 45 | 22 |
-The following is DSP's end-to-end throughput for training of OpenSora:
-![dsp_overview](../assets/figures/dsp_exp.png)
-### Usage
-DSP is currently supported for: OpenSora, OpenSoraPlan and Latte. To enable DSP, you just need to launch with multiple GPUs.

docs/pab.md DELETED Viewed

@@ -1,121 +0,0 @@
-# Pyramid Attention Broadcast(PAB)
-[[paper](https://arxiv.org/abs/2408.12588)][[blog](https://arxiv.org/abs/2403.10266)]
-Pyramid Attention Broadcast(PAB)(#pyramid-attention-broadcastpab)
-- [Pyramid Attention Broadcast(PAB)](#pyramid-attention-broadcastpab)
-  - [Insights](#insights)
-  - [Pyramid Attention Broadcast (PAB) Mechanism](#pyramid-attention-broadcast-pab-mechanism)
-  - [Experimental Results](#experimental-results)
-  - [Usage](#usage)
-    - [Supported Models](#supported-models)
-    - [Configuration for PAB](#configuration-for-pab)
-      - [Parameters](#parameters)
-      - [Example Configuration](#example-configuration)
-We introduce Pyramid Attention Broadcast (PAB), the first approach that achieves real-time DiT-based video generation. By mitigating redundant attention computation, PAB achieves up to 21.6 FPS with 10.6x acceleration, without sacrificing quality across popular DiT-based video generation models including Open-Sora, Open-Sora-Plan, and Latte. Notably, as a training-free approach, PAB can empower any future DiT-based video generation models with real-time capabilities.
-## Insights
-![method](../assets/figures/pab_motivation.png)
-Our study reveals two key insights about the three **attention mechanisms** within video diffusion transformers:
-- First, attention differences across time steps exhibit a U-shaped pattern, with significant variations occurring during the first and last 15% of steps, while the middle 70% of steps show very stable, minor differences.
-- Second, within the stable middle segment, the variability differs among attention types:
-    - **Spatial attention** varies the most, involving high-frequency elements like edges and textures;
-    - **Temporal attention** exhibits mid-frequency variations related to movements and dynamics in videos;
-    - **Cross-modal attention** is the most stable, linking text with video content, analogous to low-frequency signals reflecting textual semantics.
-## Pyramid Attention Broadcast (PAB) Mechanism
-![method](../assets/figures/pab_method.png)
-Building on these insights, we propose a **pyramid attention broadcast (PAB)** mechanism to minimize unnecessary computations and optimize the utility of each attention module, as shown in the figure above.
-In the middle segment, we broadcast one step's attention outputs to its subsequent several steps, thereby significantly reducing the computational cost on attention modules.
-To make broadcasting more efficient while minimizing its impact on the output, we set different broadcast ranges for different attention types based on their stability and differences.
-**The smaller the variation in attention, the broader the potential broadcast range.**
-## Experimental Results
-Here are the results of our experiments; more results are available at https://oahzxl.github.io/PAB:
-![pab_vis](../assets/figures/pab_vis.png)
-## Usage
-### Supported Models
-PAB currently supports Open-Sora, Open-Sora-Plan, and Latte.
-### Configuration for PAB
-To efficiently use the Pyramid Attention Broadcast (PAB) mechanism, configure the following parameters to control the broadcasting for different attention types. This helps reduce computational costs by skipping certain steps based on attention stability.
-#### Parameters
-- **spatial_broadcast**: Enable or disable broadcasting for spatial attention.
-  - Type: `True` or `False`
-- **spatial_threshold**: Set the range of diffusion steps within which spatial attention broadcasting is applied.
-  - Format: `[min_value, max_value]`
-- **spatial_gap**: Number of steps to skip during broadcasting for spatial attention.
-  - Type: Integer
-- **temporal_broadcast**: Enable or disable broadcasting for temporal attention.
-  - Type: `True` or `False`
-- **temporal_threshold**: Set the range of diffusion steps within which temporal attention broadcasting is applied.
-  - Format: `[min_value, max_value]`
-- **temporal_gap**: Number of steps to skip during broadcasting for temporal attention.
-  - Type: Integer
-- **cross_broadcast**: Enable or disable broadcasting for cross-modal attention.
-  - Type: `True` or `False`
-- **cross_threshold**: Set the range of diffusion steps within which cross-modal attention broadcasting is applied.
-  - Format: `[min_value, max_value]`
-- **cross_gap**: Number of steps to skip during broadcasting for cross-modal attention.
-  - Type: Integer
-#### Example Configuration
-```yaml
-spatial_broadcast: True
-spatial_threshold: [100, 800]
-spatial_gap: 2
-temporal_broadcast: True
-temporal_threshold: [100, 800]
-temporal_gap: 3
-cross_broadcast: True
-cross_threshold: [100, 900]
-cross_gap: 5
-```
-Explanation:
-- **Spatial Attention**:
-  - Broadcasting enabled (`spatial_broadcast: True`)
-  - Applied within the threshold range of 100 to 800
-  - Skips every 2 steps (`spatial_gap: 2`)
-  - Active within the first 28 steps (`spatial_block: [0, 28]`)
-- **Temporal Attention**:
-  - Broadcasting enabled (`temporal_broadcast: True`)
-  - Applied within the threshold range of 100 to 800
-  - Skips every 3 steps (`temporal_gap: 3`)
-- **Cross-Modal Attention**:
-  - Broadcasting enabled (`cross_broadcast: True`)
-  - Applied within the threshold range of 100 to 900
-  - Skips every 5 steps (`cross_gap: 5`)
-Adjust these settings based on your specific needs to optimize the performance of each attention mechanism.

eval/pab/commom_metrics/README.md DELETED Viewed

@@ -1,6 +0,0 @@
-Common metrics
-Includes LPIPS, PSNR, and SSIM.
-The code is adapted from [common_metrics_on_video_quality](https://github.com/JunyaoHu/common_metrics_on_video_quality).

eval/pab/commom_metrics/calculate_lpips.py DELETED Viewed

@@ -1,97 +0,0 @@
-import lpips
-import numpy as np
-import torch
-spatial = True  # Return a spatial map of perceptual distance.
-# Linearly calibrated models (LPIPS)
-# NOTE: the network is built once at import time and reused by every call to calculate_lpips.
-loss_fn = lpips.LPIPS(net="alex", spatial=spatial)  # Can also set net = 'squeeze' or 'vgg'
-# loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False) # Can also set net = 'squeeze' or 'vgg'
-def trans(x):
-    # if greyscale images add channel
-    if x.shape[-3] == 1:
-        x = x.repeat(1, 1, 3, 1, 1)
-    # value range [0, 1] -> [-1, 1]
-    x = x * 2 - 1
-    return x
-def calculate_lpips(videos1, videos2, device):
-    # image should be RGB, IMPORTANT: normalized to [-1,1]
-    assert videos1.shape == videos2.shape
-    # videos [batch_size, timestamps, channel, h, w]
-    # support grayscale input, if grayscale -> channel*3
-    # value range [0, 1] -> [-1, 1]
-    videos1 = trans(videos1)
-    videos2 = trans(videos2)
-    lpips_results = []
-    for video_num in range(videos1.shape[0]):
-        # get a video
-        # video [timestamps, channel, h, w]
-        video1 = videos1[video_num]
-        video2 = videos2[video_num]
-        lpips_results_of_a_video = []
-        for clip_timestamp in range(len(video1)):
-            # get a img
-            # img [timestamps[x], channel, h, w]
-            # img [channel, h, w] tensor
-            img1 = video1[clip_timestamp].unsqueeze(0).to(device)
-            img2 = video2[clip_timestamp].unsqueeze(0).to(device)
-            loss_fn.to(device)
-            # calculate lpips of a video
-            lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
-        lpips_results.append(lpips_results_of_a_video)
-    lpips_results = np.array(lpips_results)
-    lpips = {}
-    lpips_std = {}
-    for clip_timestamp in range(len(video1)):
-        lpips[clip_timestamp] = np.mean(lpips_results[:, clip_timestamp])
-        lpips_std[clip_timestamp] = np.std(lpips_results[:, clip_timestamp])
-    result = {
-        "value": lpips,
-        "value_std": lpips_std,
-        "video_setting": video1.shape,
-        "video_setting_name": "time, channel, heigth, width",
-    }
-    return result
-# test code / using example
-def main():
-    NUMBER_OF_VIDEOS = 8
-    VIDEO_LENGTH = 50
-    CHANNEL = 3
-    SIZE = 64
-    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
-    videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
-    device = torch.device("cuda")
-    # device = torch.device("cpu")
-    import json
-    result = calculate_lpips(videos1, videos2, device)
-    print(json.dumps(result, indent=4))
-if __name__ == "__main__":
-    main()

eval/pab/commom_metrics/calculate_psnr.py DELETED Viewed

@@ -1,90 +0,0 @@
-import math
-import numpy as np
-import torch
-def img_psnr(img1, img2):
-    # [0,1]
-    # compute mse
-    # mse = np.mean((img1-img2)**2)
-    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
-    # compute psnr
-    if mse < 1e-10:
-        return 100
-    psnr = 20 * math.log10(1 / math.sqrt(mse))
-    return psnr
-def trans(x):
-    return x
-def calculate_psnr(videos1, videos2):
-    # videos [batch_size, timestamps, channel, h, w]
-    assert videos1.shape == videos2.shape
-    videos1 = trans(videos1)
-    videos2 = trans(videos2)
-    psnr_results = []
-    for video_num in range(videos1.shape[0]):
-        # get a video
-        # video [timestamps, channel, h, w]
-        video1 = videos1[video_num]
-        video2 = videos2[video_num]
-        psnr_results_of_a_video = []
-        for clip_timestamp in range(len(video1)):
-            # get a img
-            # img [timestamps[x], channel, h, w]
-            # img [channel, h, w] numpy
-            img1 = video1[clip_timestamp].numpy()
-            img2 = video2[clip_timestamp].numpy()
-            # calculate psnr of a video
-            psnr_results_of_a_video.append(img_psnr(img1, img2))
-        psnr_results.append(psnr_results_of_a_video)
-    psnr_results = np.array(psnr_results)
-    psnr = {}
-    psnr_std = {}
-    for clip_timestamp in range(len(video1)):
-        psnr[clip_timestamp] = np.mean(psnr_results[:, clip_timestamp])
-        psnr_std[clip_timestamp] = np.std(psnr_results[:, clip_timestamp])
-    result = {
-        "value": psnr,
-        "value_std": psnr_std,
-        "video_setting": video1.shape,
-        "video_setting_name": "time, channel, heigth, width",
-    }
-    return result
-# test code / using example
-def main():
-    NUMBER_OF_VIDEOS = 8
-    VIDEO_LENGTH = 50
-    CHANNEL = 3
-    SIZE = 64
-    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
-    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
-    import json
-    result = calculate_psnr(videos1, videos2)
-    print(json.dumps(result, indent=4))
-if __name__ == "__main__":
-    main()

eval/pab/commom_metrics/calculate_ssim.py DELETED Viewed

@@ -1,116 +0,0 @@
-import cv2
-import numpy as np
-import torch
-def ssim(img1, img2):
-    C1 = 0.01**2
-    C2 = 0.03**2
-    img1 = img1.astype(np.float64)
-    img2 = img2.astype(np.float64)
-    kernel = cv2.getGaussianKernel(11, 1.5)
-    window = np.outer(kernel, kernel.transpose())
-    mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # valid
-    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
-    mu1_sq = mu1**2
-    mu2_sq = mu2**2
-    mu1_mu2 = mu1 * mu2
-    sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
-    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
-    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
-    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
-    return ssim_map.mean()
-def calculate_ssim_function(img1, img2):
-    # [0,1]
-    # ssim is the only metric extremely sensitive to gray being compared to b/w
-    if not img1.shape == img2.shape:
-        raise ValueError("Input images must have the same dimensions.")
-    if img1.ndim == 2:
-        return ssim(img1, img2)
-    elif img1.ndim == 3:
-        if img1.shape[0] == 3:
-            ssims = []
-            for i in range(3):
-                ssims.append(ssim(img1[i], img2[i]))
-            return np.array(ssims).mean()
-        elif img1.shape[0] == 1:
-            return ssim(np.squeeze(img1), np.squeeze(img2))
-    else:
-        raise ValueError("Wrong input image dimensions.")
-def trans(x):
-    return x
-def calculate_ssim(videos1, videos2):
-    # videos [batch_size, timestamps, channel, h, w]
-    assert videos1.shape == videos2.shape
-    videos1 = trans(videos1)
-    videos2 = trans(videos2)
-    ssim_results = []
-    for video_num in range(videos1.shape[0]):
-        # get a video
-        # video [timestamps, channel, h, w]
-        video1 = videos1[video_num]
-        video2 = videos2[video_num]
-        ssim_results_of_a_video = []
-        for clip_timestamp in range(len(video1)):
-            # get a img
-            # img [timestamps[x], channel, h, w]
-            # img [channel, h, w] numpy
-            img1 = video1[clip_timestamp].numpy()
-            img2 = video2[clip_timestamp].numpy()
-            # calculate ssim of a video
-            ssim_results_of_a_video.append(calculate_ssim_function(img1, img2))
-        ssim_results.append(ssim_results_of_a_video)
-    ssim_results = np.array(ssim_results)
-    ssim = {}
-    ssim_std = {}
-    for clip_timestamp in range(len(video1)):
-        ssim[clip_timestamp] = np.mean(ssim_results[:, clip_timestamp])
-        ssim_std[clip_timestamp] = np.std(ssim_results[:, clip_timestamp])
-    result = {
-        "value": ssim,
-        "value_std": ssim_std,
-        "video_setting": video1.shape,
-        "video_setting_name": "time, channel, heigth, width",
-    }
-    return result
-# test code / using example
-def main():
-    NUMBER_OF_VIDEOS = 8
-    VIDEO_LENGTH = 50
-    CHANNEL = 3
-    SIZE = 64
-    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
-    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
-    torch.device("cuda")
-    import json
-    result = calculate_ssim(videos1, videos2)
-    print(json.dumps(result, indent=4))
-if __name__ == "__main__":
-    main()

eval/pab/commom_metrics/eval.py DELETED Viewed

@@ -1,160 +0,0 @@
-import argparse
-import os
-import imageio
-import torch
-import torchvision.transforms.functional as F
-import tqdm
-from calculate_lpips import calculate_lpips
-from calculate_psnr import calculate_psnr
-from calculate_ssim import calculate_ssim
-def load_videos(directory, video_ids, file_extension):
-    videos = []
-    for video_id in video_ids:
-        video_path = os.path.join(directory, f"{video_id}.{file_extension}")
-        if os.path.exists(video_path):
-            video = load_video(video_path)  # Define load_video based on how videos are stored
-            videos.append(video)
-        else:
-            raise ValueError(f"Video {video_id}.{file_extension} not found in {directory}")
-    return videos
-def load_video(video_path):
-    """
-    Load a video from the given path and convert it to a PyTorch tensor.
-    """
-    # Read the video using imageio
-    reader = imageio.get_reader(video_path, "ffmpeg")
-    # Extract frames and convert to a list of tensors
-    frames = []
-    for frame in reader:
-        # Convert the frame to a tensor and permute the dimensions to match (C, H, W)
-        frame_tensor = torch.tensor(frame).cuda().permute(2, 0, 1)
-        frames.append(frame_tensor)
-    # Stack the list of tensors into a single tensor with shape (T, C, H, W)
-    video_tensor = torch.stack(frames)
-    return video_tensor
-def resize_video(video, target_height, target_width):
-    resized_frames = []
-    for frame in video:
-        resized_frame = F.resize(frame, [target_height, target_width])
-        resized_frames.append(resized_frame)
-    return torch.stack(resized_frames)
-def preprocess_eval_video(eval_video, generated_video_shape):
-    T_gen, _, H_gen, W_gen = generated_video_shape
-    T_eval, _, H_eval, W_eval = eval_video.shape
-    if T_eval < T_gen:
-        raise ValueError(f"Eval video time steps ({T_eval}) are less than generated video time steps ({T_gen}).")
-    if H_eval < H_gen or W_eval < W_gen:
-        # Resize the video maintaining the aspect ratio
-        resize_height = max(H_gen, int(H_gen * (H_eval / W_eval)))
-        resize_width = max(W_gen, int(W_gen * (W_eval / H_eval)))
-        eval_video = resize_video(eval_video, resize_height, resize_width)
-        # Recalculate the dimensions
-        T_eval, _, H_eval, W_eval = eval_video.shape
-    # Center crop
-    start_h = (H_eval - H_gen) // 2
-    start_w = (W_eval - W_gen) // 2
-    cropped_video = eval_video[:T_gen, :, start_h : start_h + H_gen, start_w : start_w + W_gen]
-    return cropped_video
-def main(args):
-    device = "cuda"
-    gt_video_dir = args.gt_video_dir
-    generated_video_dir = args.generated_video_dir
-    video_ids = []
-    file_extension = "mp4"
-    for f in os.listdir(generated_video_dir):
-        if f.endswith(f".{file_extension}"):
-            video_ids.append(f.replace(f".{file_extension}", ""))
-    if not video_ids:
-        raise ValueError("No videos found in the generated video dataset. Exiting.")
-    print(f"Find {len(video_ids)} videos")
-    prompt_interval = 1
-    batch_size = 16
-    calculate_lpips_flag, calculate_psnr_flag, calculate_ssim_flag = True, True, True
-    lpips_results = []
-    psnr_results = []
-    ssim_results = []
-    total_len = len(video_ids) // batch_size + (1 if len(video_ids) % batch_size != 0 else 0)
-    for idx, video_id in enumerate(tqdm.tqdm(range(total_len))):
-        gt_videos_tensor = []
-        generated_videos_tensor = []
-        for i in range(batch_size):
-            video_idx = idx * batch_size + i
-            if video_idx >= len(video_ids):
-                break
-            video_id = video_ids[video_idx]
-            generated_video = load_video(os.path.join(generated_video_dir, f"{video_id}.{file_extension}"))
-            generated_videos_tensor.append(generated_video)
-            eval_video = load_video(os.path.join(gt_video_dir, f"{video_id}.{file_extension}"))
-            gt_videos_tensor.append(eval_video)
-        gt_videos_tensor = (torch.stack(gt_videos_tensor) / 255.0).cpu()
-        generated_videos_tensor = (torch.stack(generated_videos_tensor) / 255.0).cpu()
-        if calculate_lpips_flag:
-            result = calculate_lpips(gt_videos_tensor, generated_videos_tensor, device=device)
-            result = result["value"].values()
-            result = sum(result) / len(result)
-            lpips_results.append(result)
-        if calculate_psnr_flag:
-            result = calculate_psnr(gt_videos_tensor, generated_videos_tensor)
-            result = result["value"].values()
-            result = sum(result) / len(result)
-            psnr_results.append(result)
-        if calculate_ssim_flag:
-            result = calculate_ssim(gt_videos_tensor, generated_videos_tensor)
-            result = result["value"].values()
-            result = sum(result) / len(result)
-            ssim_results.append(result)
-        if (idx + 1) % prompt_interval == 0:
-            out_str = ""
-            for results, name in zip([lpips_results, psnr_results, ssim_results], ["lpips", "psnr", "ssim"]):
-                result = sum(results) / len(results)
-                out_str += f"{name}: {result:.4f}, "
-            print(f"Processed {idx + 1} videos. {out_str[:-2]}")
-    out_str = ""
-    for results, name in zip([lpips_results, psnr_results, ssim_results], ["lpips", "psnr", "ssim"]):
-        result = sum(results) / len(results)
-        out_str += f"{name}: {result:.4f}, "
-    out_str = out_str[:-2]
-    # save
-    with open(f"./{os.path.basename(generated_video_dir)}.txt", "w+") as f:
-        f.write(out_str)
-    print(f"Processed all videos. {out_str}")
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--gt_video_dir", type=str)
-    parser.add_argument("--generated_video_dir", type=str)
-    args = parser.parse_args()
-    main(args)

eval/pab/experiments/attention_ablation.py DELETED Viewed

@@ -1,60 +0,0 @@
-from utils import generate_func, read_prompt_list
-import videosys
-from videosys import OpenSoraConfig, OpenSoraPipeline
-from videosys.models.open_sora import OpenSoraPABConfig
-def attention_ablation_func(pab_kwargs, prompt_list, output_dir):
-    pab_config = OpenSoraPABConfig(**pab_kwargs)
-    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, output_dir)
-def main(prompt_list):
-    # spatial
-    gap_list = [2, 3, 4, 5]
-    for gap in gap_list:
-        pab_kwargs = {
-            "spatial_broadcast": True,
-            "spatial_gap": gap,
-            "temporal_broadcast": False,
-            "cross_broadcast": False,
-            "mlp_skip": False,
-        }
-        output_dir = f"./samples/attention_ablation/spatial_g{gap}"
-        attention_ablation_func(pab_kwargs, prompt_list, output_dir)
-    # temporal
-    gap_list = [3, 4, 5, 6]
-    for gap in gap_list:
-        pab_kwargs = {
-            "spatial_broadcast": False,
-            "temporal_broadcast": True,
-            "temporal_gap": gap,
-            "cross_broadcast": False,
-            "mlp_skip": False,
-        }
-        output_dir = f"./samples/attention_ablation/temporal_g{gap}"
-        attention_ablation_func(pab_kwargs, prompt_list, output_dir)
-    # cross
-    gap_list = [5, 6, 7, 8]
-    for gap in gap_list:
-        pab_kwargs = {
-            "spatial_broadcast": False,
-            "temporal_broadcast": False,
-            "cross_broadcast": True,
-            "cross_gap": gap,
-            "mlp_skip": False,
-        }
-        output_dir = f"./samples/attention_ablation/cross_g{gap}"
-        attention_ablation_func(pab_kwargs, prompt_list, output_dir)
-if __name__ == "__main__":
-    videosys.initialize(42)
-    prompt_list = read_prompt_list("vbench/VBench_full_info.json")
-    main(prompt_list)

eval/pab/experiments/components_ablation.py DELETED Viewed

@@ -1,46 +0,0 @@
-from utils import generate_func, read_prompt_list
-import videosys
-from videosys import OpenSoraConfig, OpenSoraPipeline
-from videosys.models.open_sora import OpenSoraPABConfig
-def wo_spatial(prompt_list):
-    pab_config = OpenSoraPABConfig(spatial_broadcast=False)
-    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_spatial")
-def wo_temporal(prompt_list):
-    pab_config = OpenSoraPABConfig(temporal_broadcast=False)
-    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_temporal")
-def wo_cross(prompt_list):
-    pab_config = OpenSoraPABConfig(cross_broadcast=False)
-    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_cross")
-def wo_mlp(prompt_list):
-    pab_config = OpenSoraPABConfig(mlp_skip=False)
-    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_mlp")
-if __name__ == "__main__":
-    videosys.initialize(42)
-    prompt_list = read_prompt_list("./vbench/VBench_full_info.json")
-    wo_spatial(prompt_list)
-    wo_temporal(prompt_list)
-    wo_cross(prompt_list)
-    wo_mlp(prompt_list)

eval/pab/experiments/latte.py DELETED Viewed

@@ -1,57 +0,0 @@
-from utils import generate_func, read_prompt_list
-import videosys
-from videosys import LatteConfig, LattePipeline
-from videosys.models.latte import LattePABConfig
-def eval_base(prompt_list):
-    config = LatteConfig()
-    pipeline = LattePipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/latte_base", loop=5)
-def eval_pab1(prompt_list):
-    pab_config = LattePABConfig(
-        spatial_gap=2,
-        temporal_gap=3,
-        cross_gap=6,
-    )
-    config = LatteConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = LattePipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/latte_pab1", loop=5)
-def eval_pab2(prompt_list):
-    pab_config = LattePABConfig(
-        spatial_gap=3,
-        temporal_gap=4,
-        cross_gap=7,
-    )
-    config = LatteConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = LattePipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/latte_pab2", loop=5)
-def eval_pab3(prompt_list):
-    pab_config = LattePABConfig(
-        spatial_gap=4,
-        temporal_gap=6,
-        cross_gap=9,
-    )
-    config = LatteConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = LattePipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/latte_pab3", loop=5)
-if __name__ == "__main__":
-    videosys.initialize(42)
-    prompt_list = read_prompt_list("vbench/VBench_full_info.json")
-    eval_base(prompt_list)
-    eval_pab1(prompt_list)
-    eval_pab2(prompt_list)
-    eval_pab3(prompt_list)

eval/pab/experiments/opensora.py DELETED Viewed

@@ -1,44 +0,0 @@
-from utils import generate_func, read_prompt_list
-import videosys
-from videosys import OpenSoraConfig, OpenSoraPipeline
-from videosys.models.open_sora import OpenSoraPABConfig
-def eval_base(prompt_list):
-    config = OpenSoraConfig()
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensora_base", loop=5)
-def eval_pab1(prompt_list):
-    config = OpenSoraConfig(enable_pab=True)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensora_pab1", loop=5)
-def eval_pab2(prompt_list):
-    pab_config = OpenSoraPABConfig(spatial_gap=3, temporal_gap=5, cross_gap=7)
-    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensora_pab2", loop=5)
-def eval_pab3(prompt_list):
-    pab_config = OpenSoraPABConfig(spatial_gap=5, temporal_gap=7, cross_gap=9)
-    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensora_pab3", loop=5)
-if __name__ == "__main__":
-    videosys.initialize(42)
-    prompt_list = read_prompt_list("vbench/VBench_full_info.json")
-    eval_base(prompt_list)
-    eval_pab1(prompt_list)
-    eval_pab2(prompt_list)
-    eval_pab3(prompt_list)

eval/pab/experiments/opensora_plan.py DELETED Viewed

@@ -1,57 +0,0 @@
-from utils import generate_func, read_prompt_list
-import videosys
-from videosys import OpenSoraPlanConfig, OpenSoraPlanPipeline
-from videosys.models.open_sora_plan import OpenSoraPlanPABConfig
-def eval_base(prompt_list):
-    config = OpenSoraPlanConfig()
-    pipeline = OpenSoraPlanPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensoraplan_base", loop=5)
-def eval_pab1(prompt_list):
-    pab_config = OpenSoraPlanPABConfig(
-        spatial_gap=2,
-        temporal_gap=4,
-        cross_gap=6,
-    )
-    config = OpenSoraPlanConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPlanPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensoraplan_pab1", loop=5)
-def eval_pab2(prompt_list):
-    pab_config = OpenSoraPlanPABConfig(
-        spatial_gap=3,
-        temporal_gap=5,
-        cross_gap=7,
-    )
-    config = OpenSoraPlanConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPlanPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensoraplan_pab2", loop=5)
-def eval_pab3(prompt_list):
-    pab_config = OpenSoraPlanPABConfig(
-        spatial_gap=5,
-        temporal_gap=7,
-        cross_gap=9,
-    )
-    config = OpenSoraPlanConfig(enable_pab=True, pab_config=pab_config)
-    pipeline = OpenSoraPlanPipeline(config)
-    generate_func(pipeline, prompt_list, "./samples/opensoraplan_pab3", loop=5)
-if __name__ == "__main__":
-    videosys.initialize(42)
-    prompt_list = read_prompt_list("vbench/VBench_full_info.json")
-    eval_base(prompt_list)
-    eval_pab1(prompt_list)
-    eval_pab2(prompt_list)
-    eval_pab3(prompt_list)

eval/pab/experiments/utils.py DELETED Viewed

@@ -1,22 +0,0 @@
-import json
-import os
-import tqdm
-from videosys.utils.utils import set_seed
-def generate_func(pipeline, prompt_list, output_dir, loop: int = 5, kwargs: dict = {}):
-    kwargs["verbose"] = False
-    for prompt in tqdm.tqdm(prompt_list):
-        for l in range(loop):
-            set_seed(l)
-            video = pipeline.generate(prompt, **kwargs).video[0]
-            pipeline.save_video(video, os.path.join(output_dir, f"{prompt}-{l}.mp4"))
-def read_prompt_list(prompt_list_path):
-    with open(prompt_list_path, "r") as f:
-        prompt_list = json.load(f)
-    prompt_list = [prompt["prompt_en"] for prompt in prompt_list]
-    return prompt_list

eval/pab/vbench/VBench_full_info.json DELETED Viewed

The diff for this file is too large to render. See raw diff

eval/pab/vbench/cal_vbench.py DELETED Viewed

@@ -1,154 +0,0 @@
-import argparse
-import json
-import os
-SEMANTIC_WEIGHT = 1
-QUALITY_WEIGHT = 4
-QUALITY_LIST = [
-    "subject consistency",
-    "background consistency",
-    "temporal flickering",
-    "motion smoothness",
-    "aesthetic quality",
-    "imaging quality",
-    "dynamic degree",
-]
-SEMANTIC_LIST = [
-    "object class",
-    "multiple objects",
-    "human action",
-    "color",
-    "spatial relationship",
-    "scene",
-    "appearance style",
-    "temporal style",
-    "overall consistency",
-]
-NORMALIZE_DIC = {
-    "subject consistency": {"Min": 0.1462, "Max": 1.0},
-    "background consistency": {"Min": 0.2615, "Max": 1.0},
-    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
-    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
-    "dynamic degree": {"Min": 0.0, "Max": 1.0},
-    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
-    "imaging quality": {"Min": 0.0, "Max": 1.0},
-    "object class": {"Min": 0.0, "Max": 1.0},
-    "multiple objects": {"Min": 0.0, "Max": 1.0},
-    "human action": {"Min": 0.0, "Max": 1.0},
-    "color": {"Min": 0.0, "Max": 1.0},
-    "spatial relationship": {"Min": 0.0, "Max": 1.0},
-    "scene": {"Min": 0.0, "Max": 0.8222},
-    "appearance style": {"Min": 0.0009, "Max": 0.2855},
-    "temporal style": {"Min": 0.0, "Max": 0.364},
-    "overall consistency": {"Min": 0.0, "Max": 0.364},
-}
-DIM_WEIGHT = {
-    "subject consistency": 1,
-    "background consistency": 1,
-    "temporal flickering": 1,
-    "motion smoothness": 1,
-    "aesthetic quality": 1,
-    "imaging quality": 1,
-    "dynamic degree": 0.5,
-    "object class": 1,
-    "multiple objects": 1,
-    "human action": 1,
-    "color": 1,
-    "spatial relationship": 1,
-    "scene": 1,
-    "appearance style": 1,
-    "temporal style": 1,
-    "overall consistency": 1,
-}
-ordered_scaled_res = [
-    "total score",
-    "quality score",
-    "semantic score",
-    "subject consistency",
-    "background consistency",
-    "temporal flickering",
-    "motion smoothness",
-    "dynamic degree",
-    "aesthetic quality",
-    "imaging quality",
-    "object class",
-    "multiple objects",
-    "human action",
-    "color",
-    "spatial relationship",
-    "scene",
-    "appearance style",
-    "temporal style",
-    "overall consistency",
-]
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--score_dir", required=True, type=str)
-    args = parser.parse_args()
-    return args
-if __name__ == "__main__":
-    args = parse_args()
-    res_postfix = "_eval_results.json"
-    info_postfix = "_full_info.json"
-    files = os.listdir(args.score_dir)
-    res_files = [x for x in files if res_postfix in x]
-    info_files = [x for x in files if info_postfix in x]
-    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"
-    full_results = {}
-    for res_file in res_files:
-        # first check if results is normal
-        info_file = res_file.split(res_postfix)[0] + info_postfix
-        with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f:
-            info = json.load(f)
-            assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
-        # read results
-        with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
-            data = json.load(f)
-            for key, val in data.items():
-                full_results[key] = format(val[0], ".4f")
-    scaled_results = {}
-    dims = set()
-    for key, val in full_results.items():
-        dim = key.replace("_", " ") if "_" in key else key
-        scaled_score = (float(val) - NORMALIZE_DIC[dim]["Min"]) / (
-            NORMALIZE_DIC[dim]["Max"] - NORMALIZE_DIC[dim]["Min"]
-        )
-        scaled_score *= DIM_WEIGHT[dim]
-        scaled_results[dim] = scaled_score
-        dims.add(dim)
-    assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet"
-    quality_score = sum([scaled_results[i] for i in QUALITY_LIST]) / sum([DIM_WEIGHT[i] for i in QUALITY_LIST])
-    semantic_score = sum([scaled_results[i] for i in SEMANTIC_LIST]) / sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST])
-    scaled_results["quality score"] = quality_score
-    scaled_results["semantic score"] = semantic_score
-    scaled_results["total score"] = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (
-        QUALITY_WEIGHT + SEMANTIC_WEIGHT
-    )
-    formated_scaled_results = {"items": []}
-    for key in ordered_scaled_res:
-        formated_score = format(scaled_results[key] * 100, ".2f") + "%"
-        formated_scaled_results["items"].append({key: formated_score})
-    output_file_path = os.path.join(args.score_dir, "all_results.json")
-    with open(output_file_path, "w") as outfile:
-        json.dump(full_results, outfile, indent=4, sort_keys=True)
-    print(f"results saved to: {output_file_path}")
-    scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")
-    with open(scaled_file_path, "w") as outfile:
-        json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
-    print(f"results saved to: {scaled_file_path}")

eval/pab/vbench/run_vbench.py DELETED Viewed

@@ -1,52 +0,0 @@
-import argparse
-import torch
-from vbench import VBench
-full_info_path = "./vbench/VBench_full_info.json"
-dimensions = [
-    "subject_consistency",
-    "imaging_quality",
-    "background_consistency",
-    "motion_smoothness",
-    "overall_consistency",
-    "human_action",
-    "multiple_objects",
-    "spatial_relationship",
-    "object_class",
-    "color",
-    "aesthetic_quality",
-    "appearance_style",
-    "temporal_flickering",
-    "scene",
-    "temporal_style",
-    "dynamic_degree",
-]
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--video_path", required=True, type=str)
-    args = parser.parse_args()
-    return args
-if __name__ == "__main__":
-    args = parse_args()
-    save_path = args.video_path.replace("/samples/", "/vbench_out/")
-    kwargs = {}
-    kwargs["imaging_quality_preprocessing_mode"] = "longer"  # use VBench/evaluate.py default
-    for dimension in dimensions:
-        my_VBench = VBench(torch.device("cuda"), full_info_path, save_path)
-        my_VBench.evaluate(
-            videos_path=args.video_path,
-            name=dimension,
-            local=False,
-            read_frame=False,
-            dimension_list=[dimension],
-            mode="vbench_standard",
-            **kwargs,
-        )

examples/cogvideo/sample.py DELETED Viewed

@@ -1,14 +0,0 @@
-from videosys import CogVideoConfig, VideoSysEngine
-def run_base():
-    config = CogVideoConfig(world_size=1)
-    engine = VideoSysEngine(config)
-    prompt = "Sunset over the sea."
-    video = engine.generate(prompt).video[0]
-    engine.save_video(video, f"./outputs/{prompt}.mp4")
-if __name__ == "__main__":
-    run_base()

examples/latte/sample.py DELETED Viewed

@@ -1,24 +0,0 @@
-from videosys import LatteConfig, VideoSysEngine
-def run_base():
-    config = LatteConfig(world_size=1)
-    engine = VideoSysEngine(config)
-    prompt = "Sunset over the sea."
-    video = engine.generate(prompt).video[0]
-    engine.save_video(video, f"./outputs/{prompt}.mp4")
-def run_pab():
-    config = LatteConfig(world_size=1)
-    engine = VideoSysEngine(config)
-    prompt = "Sunset over the sea."
-    video = engine.generate(prompt).video[0]
-    engine.save_video(video, f"./outputs/{prompt}.mp4")
-if __name__ == "__main__":
-    run_base()
-    # run_pab()

examples/open_sora/sample.py DELETED Viewed

@@ -1,24 +0,0 @@
-from videosys import OpenSoraConfig, VideoSysEngine
-def run_base():
-    config = OpenSoraConfig(world_size=1)
-    engine = VideoSysEngine(config)
-    prompt = "Sunset over the sea."
-    video = engine.generate(prompt).video[0]
-    engine.save_video(video, f"./outputs/{prompt}.mp4")
-def run_pab():
-    config = OpenSoraConfig(world_size=1, enable_pab=True)
-    engine = VideoSysEngine(config)
-    prompt = "Sunset over the sea."
-    video = engine.generate(prompt).video[0]
-    engine.save_video(video, f"./outputs/{prompt}.mp4")
-if __name__ == "__main__":
-    run_base()
-    run_pab()

examples/open_sora_plan/sample.py DELETED Viewed

@@ -1,24 +0,0 @@
-from videosys import OpenSoraPlanConfig, VideoSysEngine
-def run_base():
-    config = OpenSoraPlanConfig(world_size=1)
-    engine = VideoSysEngine(config)
-    prompt = "Sunset over the sea."
-    video = engine.generate(prompt).video[0]
-    engine.save_video(video, f"./outputs/{prompt}.mp4")
-def run_pab():
-    config = OpenSoraPlanConfig(world_size=1)
-    engine = VideoSysEngine(config)
-    prompt = "Sunset over the sea."
-    video = engine.generate(prompt).video[0]
-    engine.save_video(video, f"./outputs/{prompt}.mp4")
-if __name__ == "__main__":
-    run_base()
-    # run_pab()

videosys/__init__.py CHANGED Viewed

@@ -1,19 +1,15 @@
 from .core.engine import VideoSysEngine
 from .core.parallel_mgr import initialize
-from .models.cogvideo.pipeline import CogVideoConfig, CogVideoPipeline
-from .models.latte.pipeline import LatteConfig, LattePipeline
-from .models.open_sora.pipeline import OpenSoraConfig, OpenSoraPipeline
-from .models.open_sora_plan.pipeline import OpenSoraPlanConfig, OpenSoraPlanPipeline
 __all__ = [
     "initialize",
     "VideoSysEngine",
-    "LattePipeline",
-    "LatteConfig",
-    "OpenSoraPlanPipeline",
-    "OpenSoraPlanConfig",
-    "OpenSoraPipeline",
-    "OpenSoraConfig",
-    "CogVideoConfig",
-    "CogVideoPipeline",
-]

 from .core.engine import VideoSysEngine
 from .core.parallel_mgr import initialize
+from .pipelines.cogvideox import CogVideoXConfig, CogVideoXPABConfig, CogVideoXPipeline
+from .pipelines.latte import LatteConfig, LattePABConfig, LattePipeline
+from .pipelines.open_sora import OpenSoraConfig, OpenSoraPABConfig, OpenSoraPipeline
+from .pipelines.open_sora_plan import OpenSoraPlanConfig, OpenSoraPlanPABConfig, OpenSoraPlanPipeline
 __all__ = [
     "initialize",
     "VideoSysEngine",
+    "LattePipeline", "LatteConfig", "LattePABConfig",
+    "OpenSoraPlanPipeline", "OpenSoraPlanConfig", "OpenSoraPlanPABConfig",
+    "OpenSoraPipeline", "OpenSoraConfig", "OpenSoraPABConfig",
+    "CogVideoXConfig", "CogVideoXPipeline", "CogVideoXPABConfig"
+]  # fmt: skip

videosys/core/engine.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 from functools import partial
 from typing import Any, Optional
-import imageio
 import torch
 import videosys
@@ -120,8 +119,7 @@ class VideoSysEngine:
             result.get()
     def save_video(self, video, output_path):
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        imageio.mimwrite(output_path, video, fps=24)
     def shutdown(self):
         if (worker_monitor := getattr(self, "worker_monitor", None)) is not None:
@@ -129,4 +127,4 @@ class VideoSysEngine:
         torch.distributed.destroy_process_group()
     def __del__(self):
-        self.shutdown()

 from functools import partial
 from typing import Any, Optional
 import torch
 import videosys
             result.get()
     def save_video(self, video, output_path):
+        return self.driver_worker.save_video(video, output_path)
     def shutdown(self):
         if (worker_monitor := getattr(self, "worker_monitor", None)) is not None:
         torch.distributed.destroy_process_group()
     def __del__(self):
+        self.shutdown()

videosys/core/pab_mgr.py CHANGED Viewed

@@ -1,8 +1,3 @@
-import random
-import numpy as np
-import torch
 from videosys.utils.logging import logger
 PAB_MANAGER = None
@@ -12,71 +7,56 @@ class PABConfig:
     def __init__(
         self,
         steps: int,
-        cross_broadcast: bool,
-        cross_threshold: list,
-        cross_gap: int,
-        spatial_broadcast: bool,
-        spatial_threshold: list,
-        spatial_gap: int,
-        temporal_broadcast: bool,
-        temporal_threshold: list,
-        temporal_gap: int,
-        diffusion_skip: bool,
-        diffusion_timestep_respacing: list,
-        diffusion_skip_timestep: list,
-        mlp_skip: bool,
-        mlp_spatial_skip_config: dict,
-        mlp_temporal_skip_config: dict,
-        full_broadcast: bool = False,
-        full_threshold: list = None,
-        full_gap: int = 1,
     ):
         self.steps = steps
         self.cross_broadcast = cross_broadcast
         self.cross_threshold = cross_threshold
-        self.cross_gap = cross_gap
         self.spatial_broadcast = spatial_broadcast
         self.spatial_threshold = spatial_threshold
-        self.spatial_gap = spatial_gap
         self.temporal_broadcast = temporal_broadcast
         self.temporal_threshold = temporal_threshold
-        self.temporal_gap = temporal_gap
-        self.diffusion_skip = diffusion_skip
-        self.diffusion_timestep_respacing = diffusion_timestep_respacing
-        self.diffusion_skip_timestep = diffusion_skip_timestep
-        self.mlp_skip = mlp_skip
-        self.mlp_spatial_skip_config = mlp_spatial_skip_config
-        self.mlp_temporal_skip_config = mlp_temporal_skip_config
-        self.temporal_mlp_outputs = {}
-        self.spatial_mlp_outputs = {}
-        self.full_broadcast = full_broadcast
-        self.full_threshold = full_threshold
-        self.full_gap = full_gap
 class PABManager:
     def __init__(self, config: PABConfig):
         self.config: PABConfig = config
-        init_prompt = f"Init PABManager. steps: {config.steps}."
-        init_prompt += f" spatial_broadcast: {config.spatial_broadcast}, spatial_threshold: {config.spatial_threshold}, spatial_gap: {config.spatial_gap}."
-        init_prompt += f" temporal_broadcast: {config.temporal_broadcast}, temporal_threshold: {config.temporal_threshold}, temporal_gap: {config.temporal_gap}."
-        init_prompt += f" cross_broadcast: {config.cross_broadcast}, cross_threshold: {config.cross_threshold}, cross_gap: {config.cross_gap}."
-        init_prompt += f" full_broadcast: {config.full_broadcast}, full_threshold: {config.full_threshold}, full_gap: {config.full_gap}."
         logger.info(init_prompt)
     def if_broadcast_cross(self, timestep: int, count: int):
         if (
             self.config.cross_broadcast
             and (timestep is not None)
-            and (count % self.config.cross_gap != 0)
             and (self.config.cross_threshold[0] < timestep < self.config.cross_threshold[1])
         ):
             flag = True
@@ -89,7 +69,7 @@ class PABManager:
         if (
             self.config.temporal_broadcast
             and (timestep is not None)
-            and (count % self.config.temporal_gap != 0)
             and (self.config.temporal_threshold[0] < timestep < self.config.temporal_threshold[1])
         ):
             flag = True
@@ -102,7 +82,7 @@ class PABManager:
         if (
             self.config.spatial_broadcast
             and (timestep is not None)
-            and (count % self.config.spatial_gap != 0)
             and (self.config.spatial_threshold[0] < timestep < self.config.spatial_threshold[1])
         ):
             flag = True
@@ -111,19 +91,6 @@ class PABManager:
         count = (count + 1) % self.config.steps
         return flag, count
-    def if_broadcast_full(self, timestep: int, count: int, block_idx: int):
-        if (
-            self.config.full_broadcast
-            and (timestep is not None)
-            and (count % self.config.full_gap != 0)
-            and (self.config.full_threshold[0] < timestep < self.config.full_threshold[1])
-        ):
-            flag = True
-        else:
-            flag = False
-        count = (count + 1) % self.config.steps
-        return flag, count
     @staticmethod
     def _is_t_in_skip_config(all_timesteps, timestep, config):
         is_t_in_skip_config = False
@@ -139,18 +106,18 @@ class PABManager:
         return is_t_in_skip_config, skip_range
     def if_skip_mlp(self, timestep: int, count: int, block_idx: int, all_timesteps, is_temporal=False):
-        if not self.config.mlp_skip:
             return False, None, False, None
         if is_temporal:
-            cur_config = self.config.mlp_temporal_skip_config
         else:
-            cur_config = self.config.mlp_spatial_skip_config
         is_t_in_skip_config, skip_range = self._is_t_in_skip_config(all_timesteps, timestep, cur_config)
         next_flag = False
         if (
-            self.config.mlp_skip
             and (timestep is not None)
             and (timestep in cur_config)
             and (block_idx in cur_config[timestep]["block"])
@@ -159,7 +126,7 @@ class PABManager:
             next_flag = True
             count = count + 1
         elif (
-            self.config.mlp_skip
             and (timestep is not None)
             and (is_t_in_skip_config)
             and (block_idx in cur_config[skip_range[0]]["block"])
@@ -173,22 +140,22 @@ class PABManager:
     def save_skip_output(self, timestep, block_idx, ff_output, is_temporal=False):
         if is_temporal:
-            self.config.temporal_mlp_outputs[(timestep, block_idx)] = ff_output
         else:
-            self.config.spatial_mlp_outputs[(timestep, block_idx)] = ff_output
     def get_mlp_output(self, skip_range, timestep, block_idx, is_temporal=False):
         skip_start_t = skip_range[0]
         if is_temporal:
             skip_output = (
-                self.config.temporal_mlp_outputs.get((skip_start_t, block_idx), None)
-                if self.config.temporal_mlp_outputs is not None
                 else None
             )
         else:
             skip_output = (
-                self.config.spatial_mlp_outputs.get((skip_start_t, block_idx), None)
-                if self.config.spatial_mlp_outputs is not None
                 else None
             )
@@ -196,9 +163,9 @@ class PABManager:
             if timestep == skip_range[-1]:
                 # TODO: save memory
                 if is_temporal:
-                    del self.config.temporal_mlp_outputs[(skip_start_t, block_idx)]
                 else:
-                    del self.config.spatial_mlp_outputs[(skip_start_t, block_idx)]
         else:
             raise ValueError(
                 f"No stored MLP output found | t {timestep} |[{skip_range[0]}, {skip_range[-1]}] | block {block_idx}"
@@ -207,10 +174,10 @@ class PABManager:
         return skip_output
     def get_spatial_mlp_outputs(self):
-        return self.config.spatial_mlp_outputs
     def get_temporal_mlp_outputs(self):
-        return self.config.temporal_mlp_outputs
 def set_pab_manager(config: PABConfig):
@@ -250,11 +217,6 @@ def if_broadcast_spatial(timestep: int, count: int, block_idx: int):
         return False, count
     return PAB_MANAGER.if_broadcast_spatial(timestep, count, block_idx)
-def if_broadcast_full(timestep: int, count: int, block_idx: int):
-    if not enable_pab():
-        return False, count
-    return PAB_MANAGER.if_broadcast_full(timestep, count, block_idx)
 def if_broadcast_mlp(timestep: int, count: int, block_idx: int, all_timesteps, is_temporal=False):
     if not enable_pab():
@@ -268,97 +230,3 @@ def save_mlp_output(timestep: int, block_idx: int, ff_output, is_temporal=False)
 def get_mlp_output(skip_range, timestep, block_idx: int, is_temporal=False):
     return PAB_MANAGER.get_mlp_output(skip_range, timestep, block_idx, is_temporal)
-def get_diffusion_skip():
-    return enable_pab() and PAB_MANAGER.config.diffusion_skip
-def get_diffusion_timestep_respacing():
-    return PAB_MANAGER.config.diffusion_timestep_respacing
-def get_diffusion_skip_timestep():
-    return enable_pab() and PAB_MANAGER.config.diffusion_skip_timestep
-def space_timesteps(time_steps, time_bins):
-    num_bins = len(time_bins)
-    bin_size = time_steps // num_bins
-    result = []
-    for i, bin_count in enumerate(time_bins):
-        start = i * bin_size
-        end = start + bin_size
-        bin_steps = np.linspace(start, end, bin_count, endpoint=False, dtype=int).tolist()
-        result.extend(bin_steps)
-    result_tensor = torch.tensor(result, dtype=torch.int32)
-    sorted_tensor = torch.sort(result_tensor, descending=True).values
-    return sorted_tensor
-def skip_diffusion_timestep(timesteps, diffusion_skip_timestep):
-    if isinstance(timesteps, list):
-        # If timesteps is a list, we assume each element is a tensor
-        timesteps_np = [t.cpu().numpy() for t in timesteps]
-        device = timesteps[0].device
-    else:
-        # If timesteps is a tensor
-        timesteps_np = timesteps.cpu().numpy()
-        device = timesteps.device
-    num_bins = len(diffusion_skip_timestep)
-    if isinstance(timesteps_np, list):
-        bin_size = len(timesteps_np) // num_bins
-        new_timesteps = []
-        for i in range(num_bins):
-            bin_start = i * bin_size
-            bin_end = (i + 1) * bin_size if i != num_bins - 1 else len(timesteps_np)
-            bin_timesteps = timesteps_np[bin_start:bin_end]
-            if diffusion_skip_timestep[i] == 0:
-                # If the bin is marked with 0, keep all timesteps
-                new_timesteps.extend(bin_timesteps)
-            elif diffusion_skip_timestep[i] == 1:
-                # If the bin is marked with 1, omit the first timestep in the bin
-                new_timesteps.extend(bin_timesteps[1:])
-        new_timesteps_tensor = [torch.tensor(t, device=device) for t in new_timesteps]
-    else:
-        bin_size = len(timesteps_np) // num_bins
-        new_timesteps = []
-        for i in range(num_bins):
-            bin_start = i * bin_size
-            bin_end = (i + 1) * bin_size if i != num_bins - 1 else len(timesteps_np)
-            bin_timesteps = timesteps_np[bin_start:bin_end]
-            if diffusion_skip_timestep[i] == 0:
-                # If the bin is marked with 0, keep all timesteps
-                new_timesteps.extend(bin_timesteps)
-            elif diffusion_skip_timestep[i] == 1:
-                # If the bin is marked with 1, omit the first timestep in the bin
-                new_timesteps.extend(bin_timesteps[1:])
-            elif diffusion_skip_timestep[i] != 0:
-                # If the bin is marked with a non-zero value, randomly omit n timesteps
-                if len(bin_timesteps) > diffusion_skip_timestep[i]:
-                    indices_to_remove = set(random.sample(range(len(bin_timesteps)), diffusion_skip_timestep[i]))
-                    timesteps_to_keep = [
-                        timestep for idx, timestep in enumerate(bin_timesteps) if idx not in indices_to_remove
-                    ]
-                else:
-                    timesteps_to_keep = bin_timesteps  # If the bin has n or fewer timesteps, do not remove any of them
-                new_timesteps.extend(timesteps_to_keep)
-        new_timesteps_tensor = torch.tensor(new_timesteps, device=device)
-    if isinstance(timesteps, list):
-        return new_timesteps_tensor
-    else:
-        return new_timesteps_tensor

 from videosys.utils.logging import logger
 PAB_MANAGER = None
     def __init__(
         self,
         steps: int,
+        cross_broadcast: bool = False,
+        cross_threshold: list = None,
+        cross_range: int = None,
+        spatial_broadcast: bool = False,
+        spatial_threshold: list = None,
+        spatial_range: int = None,
+        temporal_broadcast: bool = False,
+        temporal_threshold: list = None,
+        temporal_range: int = None,
+        mlp_broadcast: bool = False,
+        mlp_spatial_broadcast_config: dict = None,
+        mlp_temporal_broadcast_config: dict = None,
     ):
         self.steps = steps
         self.cross_broadcast = cross_broadcast
         self.cross_threshold = cross_threshold
+        self.cross_range = cross_range
         self.spatial_broadcast = spatial_broadcast
         self.spatial_threshold = spatial_threshold
+        self.spatial_range = spatial_range
         self.temporal_broadcast = temporal_broadcast
         self.temporal_threshold = temporal_threshold
+        self.temporal_range = temporal_range
+        self.mlp_broadcast = mlp_broadcast
+        self.mlp_spatial_broadcast_config = mlp_spatial_broadcast_config
+        self.mlp_temporal_broadcast_config = mlp_temporal_broadcast_config
+        self.mlp_temporal_outputs = {}
+        self.mlp_spatial_outputs = {}
 class PABManager:
     def __init__(self, config: PABConfig):
         self.config: PABConfig = config
+        init_prompt = f"Init Pyramid Attention Broadcast. steps: {config.steps}."
+        init_prompt += f" spatial broadcast: {config.spatial_broadcast}, spatial range: {config.spatial_range}, spatial threshold: {config.spatial_threshold}."
+        init_prompt += f" temporal broadcast: {config.temporal_broadcast}, temporal range: {config.temporal_range}, temporal_threshold: {config.temporal_threshold}."
+        init_prompt += f" cross broadcast: {config.cross_broadcast}, cross range: {config.cross_range}, cross threshold: {config.cross_threshold}."
+        init_prompt += f" mlp broadcast: {config.mlp_broadcast}."
         logger.info(init_prompt)
     def if_broadcast_cross(self, timestep: int, count: int):
         if (
             self.config.cross_broadcast
             and (timestep is not None)
+            and (count % self.config.cross_range != 0)
             and (self.config.cross_threshold[0] < timestep < self.config.cross_threshold[1])
         ):
             flag = True
         if (
             self.config.temporal_broadcast
             and (timestep is not None)
+            and (count % self.config.temporal_range != 0)
             and (self.config.temporal_threshold[0] < timestep < self.config.temporal_threshold[1])
         ):
             flag = True
         if (
             self.config.spatial_broadcast
             and (timestep is not None)
+            and (count % self.config.spatial_range != 0)
             and (self.config.spatial_threshold[0] < timestep < self.config.spatial_threshold[1])
         ):
             flag = True
         count = (count + 1) % self.config.steps
         return flag, count
     @staticmethod
     def _is_t_in_skip_config(all_timesteps, timestep, config):
         is_t_in_skip_config = False
         return is_t_in_skip_config, skip_range
     def if_skip_mlp(self, timestep: int, count: int, block_idx: int, all_timesteps, is_temporal=False):
+        if not self.config.mlp_broadcast:
             return False, None, False, None
         if is_temporal:
+            cur_config = self.config.mlp_temporal_broadcast_config
         else:
+            cur_config = self.config.mlp_spatial_broadcast_config
         is_t_in_skip_config, skip_range = self._is_t_in_skip_config(all_timesteps, timestep, cur_config)
         next_flag = False
         if (
+            self.config.mlp_broadcast
             and (timestep is not None)
             and (timestep in cur_config)
             and (block_idx in cur_config[timestep]["block"])
             next_flag = True
             count = count + 1
         elif (
+            self.config.mlp_broadcast
             and (timestep is not None)
             and (is_t_in_skip_config)
             and (block_idx in cur_config[skip_range[0]]["block"])
     def save_skip_output(self, timestep, block_idx, ff_output, is_temporal=False):
         if is_temporal:
+            self.config.mlp_temporal_outputs[(timestep, block_idx)] = ff_output
         else:
+            self.config.mlp_spatial_outputs[(timestep, block_idx)] = ff_output
     def get_mlp_output(self, skip_range, timestep, block_idx, is_temporal=False):
         skip_start_t = skip_range[0]
         if is_temporal:
             skip_output = (
+                self.config.mlp_temporal_outputs.get((skip_start_t, block_idx), None)
+                if self.config.mlp_temporal_outputs is not None
                 else None
             )
         else:
             skip_output = (
+                self.config.mlp_spatial_outputs.get((skip_start_t, block_idx), None)
+                if self.config.mlp_spatial_outputs is not None
                 else None
             )
             if timestep == skip_range[-1]:
                 # TODO: save memory
                 if is_temporal:
+                    del self.config.mlp_temporal_outputs[(skip_start_t, block_idx)]
                 else:
+                    del self.config.mlp_spatial_outputs[(skip_start_t, block_idx)]
         else:
             raise ValueError(
                 f"No stored MLP output found | t {timestep} |[{skip_range[0]}, {skip_range[-1]}] | block {block_idx}"
         return skip_output
     def get_spatial_mlp_outputs(self):
+        return self.config.mlp_spatial_outputs
     def get_temporal_mlp_outputs(self):
+        return self.config.mlp_temporal_outputs
 def set_pab_manager(config: PABConfig):
         return False, count
     return PAB_MANAGER.if_broadcast_spatial(timestep, count, block_idx)
 def if_broadcast_mlp(timestep: int, count: int, block_idx: int, all_timesteps, is_temporal=False):
     if not enable_pab():
 def get_mlp_output(skip_range, timestep, block_idx: int, is_temporal=False):
     return PAB_MANAGER.get_mlp_output(skip_range, timestep, block_idx, is_temporal)

videosys/datasets/dataloader.py DELETED Viewed

@@ -1,94 +0,0 @@
-import random
-from typing import Iterator, Optional
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, Dataset, DistributedSampler
-from torch.utils.data.distributed import DistributedSampler
-from videosys.core.parallel_mgr import ParallelManager
-class StatefulDistributedSampler(DistributedSampler):
-    def __init__(
-        self,
-        dataset: Dataset,
-        num_replicas: Optional[int] = None,
-        rank: Optional[int] = None,
-        shuffle: bool = True,
-        seed: int = 0,
-        drop_last: bool = False,
-    ) -> None:
-        super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
-        self.start_index: int = 0
-    def __iter__(self) -> Iterator:
-        iterator = super().__iter__()
-        indices = list(iterator)
-        indices = indices[self.start_index :]
-        return iter(indices)
-    def __len__(self) -> int:
-        return self.num_samples - self.start_index
-    def set_start_index(self, start_index: int) -> None:
-        self.start_index = start_index
-def prepare_dataloader(
-    dataset,
-    batch_size,
-    shuffle=False,
-    seed=1024,
-    drop_last=False,
-    pin_memory=False,
-    num_workers=0,
-    pg_manager: Optional[ParallelManager] = None,
-    **kwargs,
-):
-    r"""
-    Prepare a dataloader for distributed training. The dataloader will be wrapped by
-    `torch.utils.data.DataLoader` and `StatefulDistributedSampler`.
-    Args:
-        dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
-        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
-        seed (int, optional): Random worker seed for sampling, defaults to 1024.
-        add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
-        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
-            is not divisible by the batch size. If False and the size of dataset is not divisible by
-            the batch size, then the last batch will be smaller, defaults to False.
-        pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
-        num_workers (int, optional): Number of worker subprocesses used for data loading. Defaults to 0.
-        kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
-                `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
-    Returns:
-        :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
-    """
-    _kwargs = kwargs.copy()
-    sampler = StatefulDistributedSampler(
-        dataset,
-        num_replicas=pg_manager.size(pg_manager.dp_axis),
-        rank=pg_manager.coordinate(pg_manager.dp_axis),
-        shuffle=shuffle,
-    )
-    # Deterministic dataloader
-    def seed_worker(worker_id):
-        worker_seed = seed
-        np.random.seed(worker_seed)
-        torch.manual_seed(worker_seed)
-        random.seed(worker_seed)
-    return DataLoader(
-        dataset,
-        batch_size=batch_size,
-        sampler=sampler,
-        worker_init_fn=seed_worker,
-        drop_last=drop_last,
-        pin_memory=pin_memory,
-        num_workers=num_workers,
-        **_kwargs,
-    )

videosys/datasets/image_transform.py DELETED Viewed

@@ -1,42 +0,0 @@
-# Adapted from DiT
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# DiT:      https://github.com/facebookresearch/DiT
-# --------------------------------------------------------
-import numpy as np
-import torchvision.transforms as transforms
-from PIL import Image
-def center_crop_arr(pil_image, image_size):
-    """
-    Center cropping implementation from ADM.
-    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
-    """
-    while min(*pil_image.size) >= 2 * image_size:
-        pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size), resample=Image.BOX)
-    scale = image_size / min(*pil_image.size)
-    pil_image = pil_image.resize(tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC)
-    arr = np.array(pil_image)
-    crop_y = (arr.shape[0] - image_size) // 2
-    crop_x = (arr.shape[1] - image_size) // 2
-    return Image.fromarray(arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size])
-def get_transforms_image(image_size=256):
-    transform = transforms.Compose(
-        [
-            transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, image_size)),
-            transforms.RandomHorizontalFlip(),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
-        ]
-    )
-    return transform

videosys/datasets/video_transform.py DELETED Viewed

@@ -1,441 +0,0 @@
-# Adapted from OpenSora and Latte
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# OpenSora: https://github.com/hpcaitech/Open-Sora
-# Latte:    https://github.com/Vchitect/Latte
-# --------------------------------------------------------
-import numbers
-import random
-import numpy as np
-import torch
-from PIL import Image
-def _is_tensor_video_clip(clip):
-    if not torch.is_tensor(clip):
-        raise TypeError("clip should be Tensor. Got %s" % type(clip))
-    if not clip.ndimension() == 4:
-        raise ValueError("clip should be 4D. Got %dD" % clip.dim())
-    return True
-def center_crop_arr(pil_image, image_size):
-    """
-    Center cropping implementation from ADM.
-    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
-    """
-    while min(*pil_image.size) >= 2 * image_size:
-        pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size), resample=Image.BOX)
-    scale = image_size / min(*pil_image.size)
-    pil_image = pil_image.resize(tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC)
-    arr = np.array(pil_image)
-    crop_y = (arr.shape[0] - image_size) // 2
-    crop_x = (arr.shape[1] - image_size) // 2
-    return Image.fromarray(arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size])
-def crop(clip, i, j, h, w):
-    """
-    Args:
-        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
-    """
-    if len(clip.size()) != 4:
-        raise ValueError("clip should be a 4D tensor")
-    return clip[..., i : i + h, j : j + w]
-def resize(clip, target_size, interpolation_mode):
-    if len(target_size) != 2:
-        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
-    return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)
-def resize_scale(clip, target_size, interpolation_mode):
-    if len(target_size) != 2:
-        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
-    H, W = clip.size(-2), clip.size(-1)
-    scale_ = target_size[0] / min(H, W)
-    return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
-def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
-    """
-    Do spatial cropping and resizing to the video clip
-    Args:
-        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
-        i (int): i in (i,j) i.e coordinates of the upper left corner.
-        j (int): j in (i,j) i.e coordinates of the upper left corner.
-        h (int): Height of the cropped region.
-        w (int): Width of the cropped region.
-        size (tuple(int, int)): height and width of resized clip
-    Returns:
-        clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
-    """
-    if not _is_tensor_video_clip(clip):
-        raise ValueError("clip should be a 4D torch.tensor")
-    clip = crop(clip, i, j, h, w)
-    clip = resize(clip, size, interpolation_mode)
-    return clip
-def center_crop(clip, crop_size):
-    if not _is_tensor_video_clip(clip):
-        raise ValueError("clip should be a 4D torch.tensor")
-    h, w = clip.size(-2), clip.size(-1)
-    th, tw = crop_size
-    if h < th or w < tw:
-        raise ValueError("height and width must be no smaller than crop_size")
-    i = int(round((h - th) / 2.0))
-    j = int(round((w - tw) / 2.0))
-    return crop(clip, i, j, th, tw)
-def center_crop_using_short_edge(clip):
-    if not _is_tensor_video_clip(clip):
-        raise ValueError("clip should be a 4D torch.tensor")
-    h, w = clip.size(-2), clip.size(-1)
-    if h < w:
-        th, tw = h, h
-        i = 0
-        j = int(round((w - tw) / 2.0))
-    else:
-        th, tw = w, w
-        i = int(round((h - th) / 2.0))
-        j = 0
-    return crop(clip, i, j, th, tw)
-def random_shift_crop(clip):
-    """
-    Slide along the long edge, with the short edge as crop size
-    """
-    if not _is_tensor_video_clip(clip):
-        raise ValueError("clip should be a 4D torch.tensor")
-    h, w = clip.size(-2), clip.size(-1)
-    if h <= w:
-        short_edge = h
-    else:
-        short_edge = w
-    th, tw = short_edge, short_edge
-    i = torch.randint(0, h - th + 1, size=(1,)).item()
-    j = torch.randint(0, w - tw + 1, size=(1,)).item()
-    return crop(clip, i, j, th, tw)
-def to_tensor(clip):
-    """
-    Convert tensor data type from uint8 to float, divide value by 255.0 and
-    permute the dimensions of clip tensor
-    Args:
-        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
-    Return:
-        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
-    """
-    _is_tensor_video_clip(clip)
-    if not clip.dtype == torch.uint8:
-        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
-    # return clip.float().permute(3, 0, 1, 2) / 255.0
-    return clip.float() / 255.0
-def normalize(clip, mean, std, inplace=False):
-    """
-    Args:
-        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
-        mean (tuple): pixel RGB mean. Size is (3)
-        std (tuple): pixel standard deviation. Size is (3)
-    Returns:
-        normalized clip (torch.tensor): Size is (T, C, H, W)
-    """
-    if not _is_tensor_video_clip(clip):
-        raise ValueError("clip should be a 4D torch.tensor")
-    if not inplace:
-        clip = clip.clone()
-    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
-    # print(mean)
-    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
-    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
-    return clip
-def hflip(clip):
-    """
-    Args:
-        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
-    Returns:
-        flipped clip (torch.tensor): Size is (T, C, H, W)
-    """
-    if not _is_tensor_video_clip(clip):
-        raise ValueError("clip should be a 4D torch.tensor")
-    return clip.flip(-1)
-class RandomCropVideo:
-    def __init__(self, size):
-        if isinstance(size, numbers.Number):
-            self.size = (int(size), int(size))
-        else:
-            self.size = size
-    def __call__(self, clip):
-        """
-        Args:
-            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
-        Returns:
-            torch.tensor: randomly cropped video clip.
-                size is (T, C, OH, OW)
-        """
-        i, j, h, w = self.get_params(clip)
-        return crop(clip, i, j, h, w)
-    def get_params(self, clip):
-        h, w = clip.shape[-2:]
-        th, tw = self.size
-        if h < th or w < tw:
-            raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
-        if w == tw and h == th:
-            return 0, 0, h, w
-        i = torch.randint(0, h - th + 1, size=(1,)).item()
-        j = torch.randint(0, w - tw + 1, size=(1,)).item()
-        return i, j, th, tw
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(size={self.size})"
-class CenterCropResizeVideo:
-    """
-    First use the short side for cropping length,
-    center crop video, then resize to the specified size
-    """
-    def __init__(
-        self,
-        size,
-        interpolation_mode="bilinear",
-    ):
-        if isinstance(size, tuple):
-            if len(size) != 2:
-                raise ValueError(f"size should be tuple (height, width), instead got {size}")
-            self.size = size
-        else:
-            self.size = (size, size)
-        self.interpolation_mode = interpolation_mode
-    def __call__(self, clip):
-        """
-        Args:
-            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
-        Returns:
-            torch.tensor: scale resized / center cropped video clip.
-                size is (T, C, crop_size, crop_size)
-        """
-        clip_center_crop = center_crop_using_short_edge(clip)
-        clip_center_crop_resize = resize(
-            clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
-        )
-        return clip_center_crop_resize
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
-class UCFCenterCropVideo:
-    """
-    First scale to the specified size in equal proportion to the short edge,
-    then center cropping
-    """
-    def __init__(
-        self,
-        size,
-        interpolation_mode="bilinear",
-    ):
-        if isinstance(size, tuple):
-            if len(size) != 2:
-                raise ValueError(f"size should be tuple (height, width), instead got {size}")
-            self.size = size
-        else:
-            self.size = (size, size)
-        self.interpolation_mode = interpolation_mode
-    def __call__(self, clip):
-        """
-        Args:
-            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
-        Returns:
-            torch.tensor: scale resized / center cropped video clip.
-                size is (T, C, crop_size, crop_size)
-        """
-        clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
-        clip_center_crop = center_crop(clip_resize, self.size)
-        return clip_center_crop
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
-class KineticsRandomCropResizeVideo:
-    """
-    Slide along the long edge, with the short edge as crop size. And resie to the desired size.
-    """
-    def __init__(
-        self,
-        size,
-        interpolation_mode="bilinear",
-    ):
-        if isinstance(size, tuple):
-            if len(size) != 2:
-                raise ValueError(f"size should be tuple (height, width), instead got {size}")
-            self.size = size
-        else:
-            self.size = (size, size)
-        self.interpolation_mode = interpolation_mode
-    def __call__(self, clip):
-        clip_random_crop = random_shift_crop(clip)
-        clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
-        return clip_resize
-class CenterCropVideo:
-    def __init__(
-        self,
-        size,
-        interpolation_mode="bilinear",
-    ):
-        if isinstance(size, tuple):
-            if len(size) != 2:
-                raise ValueError(f"size should be tuple (height, width), instead got {size}")
-            self.size = size
-        else:
-            self.size = (size, size)
-        self.interpolation_mode = interpolation_mode
-    def __call__(self, clip):
-        """
-        Args:
-            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
-        Returns:
-            torch.tensor: center cropped video clip.
-                size is (T, C, crop_size, crop_size)
-        """
-        clip_center_crop = center_crop(clip, self.size)
-        return clip_center_crop
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
-class NormalizeVideo:
-    """
-    Normalize the video clip by mean subtraction and division by standard deviation
-    Args:
-        mean (3-tuple): pixel RGB mean
-        std (3-tuple): pixel RGB standard deviation
-        inplace (boolean): whether do in-place normalization
-    """
-    def __init__(self, mean, std, inplace=False):
-        self.mean = mean
-        self.std = std
-        self.inplace = inplace
-    def __call__(self, clip):
-        """
-        Args:
-            clip (torch.tensor): video clip must be normalized. Size is (C, T, H, W)
-        """
-        return normalize(clip, self.mean, self.std, self.inplace)
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
-class ToTensorVideo:
-    """
-    Convert tensor data type from uint8 to float, divide value by 255.0 and
-    permute the dimensions of clip tensor
-    """
-    def __init__(self):
-        pass
-    def __call__(self, clip):
-        """
-        Args:
-            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
-        Return:
-            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
-        """
-        return to_tensor(clip)
-    def __repr__(self) -> str:
-        return self.__class__.__name__
-class RandomHorizontalFlipVideo:
-    """
-    Flip the video clip along the horizontal direction with a given probability
-    Args:
-        p (float): probability of the clip being flipped. Default value is 0.5
-    """
-    def __init__(self, p=0.5):
-        self.p = p
-    def __call__(self, clip):
-        """
-        Args:
-            clip (torch.tensor): Size is (T, C, H, W)
-        Return:
-            clip (torch.tensor): Size is (T, C, H, W)
-        """
-        if random.random() < self.p:
-            clip = hflip(clip)
-        return clip
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(p={self.p})"
-#  ------------------------------------------------------------
-#  ---------------------  Sampling  ---------------------------
-#  ------------------------------------------------------------
-class TemporalRandomCrop(object):
-    """Temporally crop the given frame indices at a random location.
-    Args:
-            size (int): Desired length of frames will be seen in the model.
-    """
-    def __init__(self, size):
-        self.size = size
-    def __call__(self, total_frames):
-        rand_end = max(0, total_frames - self.size - 1)
-        begin_index = random.randint(0, rand_end)
-        end_index = min(begin_index + self.size, total_frames)
-        return begin_index, end_index

videosys/diffusion/__init__.py DELETED Viewed

@@ -1,41 +0,0 @@
-# Modified from OpenAI's diffusion repos and Meta DiT
-#     DiT:   https://github.com/facebookresearch/DiT/tree/main
-#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-from . import gaussian_diffusion as gd
-from .respace import SpacedDiffusion, space_timesteps
-def create_diffusion(
-    timestep_respacing,
-    noise_schedule="linear",
-    use_kl=False,
-    sigma_small=False,
-    predict_xstart=False,
-    learn_sigma=True,
-    rescale_learned_sigmas=False,
-    diffusion_steps=1000,
-):
-    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
-    if use_kl:
-        loss_type = gd.LossType.RESCALED_KL
-    elif rescale_learned_sigmas:
-        loss_type = gd.LossType.RESCALED_MSE
-    else:
-        loss_type = gd.LossType.MSE
-    if timestep_respacing is None or timestep_respacing == "":
-        timestep_respacing = [diffusion_steps]
-    return SpacedDiffusion(
-        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
-        betas=betas,
-        model_mean_type=(gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X),
-        model_var_type=(
-            (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL)
-            if not learn_sigma
-            else gd.ModelVarType.LEARNED_RANGE
-        ),
-        loss_type=loss_type
-        # rescale_timesteps=rescale_timesteps,
-    )

videosys/diffusion/diffusion_utils.py DELETED Viewed

@@ -1,79 +0,0 @@
-# Modified from OpenAI's diffusion repos
-#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-import numpy as np
-import torch as th
-def normal_kl(mean1, logvar1, mean2, logvar2):
-    """
-    Compute the KL divergence between two gaussians.
-    Shapes are automatically broadcasted, so batches can be compared to
-    scalars, among other use cases.
-    """
-    tensor = None
-    for obj in (mean1, logvar1, mean2, logvar2):
-        if isinstance(obj, th.Tensor):
-            tensor = obj
-            break
-    assert tensor is not None, "at least one argument must be a Tensor"
-    # Force variances to be Tensors. Broadcasting helps convert scalars to
-    # Tensors, but it does not work for th.exp().
-    logvar1, logvar2 = [x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2)]
-    return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2))
-def approx_standard_normal_cdf(x):
-    """
-    A fast approximation of the cumulative distribution function of the
-    standard normal.
-    """
-    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
-def continuous_gaussian_log_likelihood(x, *, means, log_scales):
-    """
-    Compute the log-likelihood of a continuous Gaussian distribution.
-    :param x: the targets
-    :param means: the Gaussian mean Tensor.
-    :param log_scales: the Gaussian log stddev Tensor.
-    :return: a tensor like x of log probabilities (in nats).
-    """
-    centered_x = x - means
-    inv_stdv = th.exp(-log_scales)
-    normalized_x = centered_x * inv_stdv
-    log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x)
-    return log_probs
-def discretized_gaussian_log_likelihood(x, *, means, log_scales):
-    """
-    Compute the log-likelihood of a Gaussian distribution discretizing to a
-    given image.
-    :param x: the target images. It is assumed that this was uint8 values,
-              rescaled to the range [-1, 1].
-    :param means: the Gaussian mean Tensor.
-    :param log_scales: the Gaussian log stddev Tensor.
-    :return: a tensor like x of log probabilities (in nats).
-    """
-    assert x.shape == means.shape == log_scales.shape
-    centered_x = x - means
-    inv_stdv = th.exp(-log_scales)
-    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
-    cdf_plus = approx_standard_normal_cdf(plus_in)
-    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
-    cdf_min = approx_standard_normal_cdf(min_in)
-    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
-    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
-    cdf_delta = cdf_plus - cdf_min
-    log_probs = th.where(
-        x < -0.999,
-        log_cdf_plus,
-        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
-    )
-    assert log_probs.shape == x.shape
-    return log_probs

videosys/diffusion/gaussian_diffusion.py DELETED Viewed

@@ -1,829 +0,0 @@
-# Modified from OpenAI's diffusion repos
-#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-import enum
-import math
-import numpy as np
-import torch as th
-from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
-def mean_flat(tensor):
-    """
-    Take the mean over all non-batch dimensions.
-    """
-    return tensor.mean(dim=list(range(1, len(tensor.shape))))
-class ModelMeanType(enum.Enum):
-    """
-    Which type of output the model predicts.
-    """
-    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
-    START_X = enum.auto()  # the model predicts x_0
-    EPSILON = enum.auto()  # the model predicts epsilon
-class ModelVarType(enum.Enum):
-    """
-    What is used as the model's output variance.
-    The LEARNED_RANGE option has been added to allow the model to predict
-    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
-    """
-    LEARNED = enum.auto()
-    FIXED_SMALL = enum.auto()
-    FIXED_LARGE = enum.auto()
-    LEARNED_RANGE = enum.auto()
-class LossType(enum.Enum):
-    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
-    RESCALED_MSE = enum.auto()  # use raw MSE loss (with RESCALED_KL when learning variances)
-    KL = enum.auto()  # use the variational lower-bound
-    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB
-    def is_vb(self):
-        return self == LossType.KL or self == LossType.RESCALED_KL
-def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
-    betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
-    warmup_time = int(num_diffusion_timesteps * warmup_frac)
-    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
-    return betas
-def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
-    """
-    This is the deprecated API for creating beta schedules.
-    See get_named_beta_schedule() for the new library of schedules.
-    """
-    if beta_schedule == "quad":
-        betas = (
-            np.linspace(
-                beta_start**0.5,
-                beta_end**0.5,
-                num_diffusion_timesteps,
-                dtype=np.float64,
-            )
-            ** 2
-        )
-    elif beta_schedule == "linear":
-        betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
-    elif beta_schedule == "warmup10":
-        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
-    elif beta_schedule == "warmup50":
-        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
-    elif beta_schedule == "const":
-        betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
-    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
-        betas = 1.0 / np.linspace(num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64)
-    else:
-        raise NotImplementedError(beta_schedule)
-    assert betas.shape == (num_diffusion_timesteps,)
-    return betas
-def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
-    """
-    Get a pre-defined beta schedule for the given name.
-    The beta schedule library consists of beta schedules which remain similar
-    in the limit of num_diffusion_timesteps.
-    Beta schedules may be added, but should not be removed or changed once
-    they are committed to maintain backwards compatibility.
-    """
-    if schedule_name == "linear":
-        # Linear schedule from Ho et al, extended to work for any number of
-        # diffusion steps.
-        scale = 1000 / num_diffusion_timesteps
-        return get_beta_schedule(
-            "linear",
-            beta_start=scale * 0.0001,
-            beta_end=scale * 0.02,
-            num_diffusion_timesteps=num_diffusion_timesteps,
-        )
-    elif schedule_name == "squaredcos_cap_v2":
-        return betas_for_alpha_bar(
-            num_diffusion_timesteps,
-            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
-        )
-    else:
-        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
-def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function,
-    which defines the cumulative product of (1-beta) over time from t = [0,1].
-    :param num_diffusion_timesteps: the number of betas to produce.
-    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
-                      produces the cumulative product of (1-beta) up to that
-                      part of the diffusion process.
-    :param max_beta: the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-    """
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return np.array(betas)
-class GaussianDiffusion:
-    """
-    Utilities for training and sampling diffusion models.
-    Original ported from this codebase:
-    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
-    :param betas: a 1-D numpy array of betas for each diffusion timestep,
-                  starting at T and going to 1.
-    """
-    def __init__(self, *, betas, model_mean_type, model_var_type, loss_type):
-        self.model_mean_type = model_mean_type
-        self.model_var_type = model_var_type
-        self.loss_type = loss_type
-        # Use float64 for accuracy.
-        betas = np.array(betas, dtype=np.float64)
-        self.betas = betas
-        assert len(betas.shape) == 1, "betas must be 1-D"
-        assert (betas > 0).all() and (betas <= 1).all()
-        self.num_timesteps = int(betas.shape[0])
-        alphas = 1.0 - betas
-        self.alphas_cumprod = np.cumprod(alphas, axis=0)
-        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
-        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
-        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
-        # calculations for diffusion q(x_t | x_{t-1}) and others
-        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
-        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
-        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
-        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
-        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
-        # calculations for posterior q(x_{t-1} | x_t, x_0)
-        self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
-        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
-        self.posterior_log_variance_clipped = (
-            np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:]))
-            if len(self.posterior_variance) > 1
-            else np.array([])
-        )
-        self.posterior_mean_coef1 = betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
-        self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
-    def q_mean_variance(self, x_start, t):
-        """
-        Get the distribution q(x_t | x_0).
-        :param x_start: the [N x C x ...] tensor of noiseless inputs.
-        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
-        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
-        """
-        mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
-        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
-        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
-        return mean, variance, log_variance
-    def q_sample(self, x_start, t, noise=None):
-        """
-        Diffuse the data for a given number of diffusion steps.
-        In other words, sample from q(x_t | x_0).
-        :param x_start: the initial data batch.
-        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
-        :param noise: if specified, the split-out normal noise.
-        :return: A noisy version of x_start.
-        """
-        if noise is None:
-            noise = th.randn_like(x_start)
-        assert noise.shape == x_start.shape
-        return (
-            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
-            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
-        )
-    def q_posterior_mean_variance(self, x_start, x_t, t):
-        """
-        Compute the mean and variance of the diffusion posterior:
-            q(x_{t-1} | x_t, x_0)
-        """
-        assert x_start.shape == x_t.shape
-        posterior_mean = (
-            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
-            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
-        )
-        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
-        posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
-        assert (
-            posterior_mean.shape[0]
-            == posterior_variance.shape[0]
-            == posterior_log_variance_clipped.shape[0]
-            == x_start.shape[0]
-        )
-        return posterior_mean, posterior_variance, posterior_log_variance_clipped
-    def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
-        """
-        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
-        the initial x, x_0.
-        :param model: the model, which takes a signal and a batch of timesteps
-                      as input.
-        :param x: the [N x C x ...] tensor at time t.
-        :param t: a 1-D Tensor of timesteps.
-        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
-        :param denoised_fn: if not None, a function which applies to the
-            x_start prediction before it is used to sample. Applies before
-            clip_denoised.
-        :param model_kwargs: if not None, a dict of extra keyword arguments to
-            pass to the model. This can be used for conditioning.
-        :return: a dict with the following keys:
-                 - 'mean': the model mean output.
-                 - 'variance': the model variance output.
-                 - 'log_variance': the log of 'variance'.
-                 - 'pred_xstart': the prediction for x_0.
-        """
-        if model_kwargs is None:
-            model_kwargs = {}
-        B, C = x.shape[:2]
-        assert t.shape == (B,)
-        model_output = model(x, t, **model_kwargs)
-        if isinstance(model_output, tuple):
-            model_output, extra = model_output
-        else:
-            extra = None
-        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
-            assert model_output.shape == (B, C * 2, *x.shape[2:])
-            model_output, model_var_values = th.split(model_output, C, dim=1)
-            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
-            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
-            # The model_var_values is [-1, 1] for [min_var, max_var].
-            frac = (model_var_values + 1) / 2
-            model_log_variance = frac * max_log + (1 - frac) * min_log
-            model_variance = th.exp(model_log_variance)
-        else:
-            model_variance, model_log_variance = {
-                # for fixedlarge, we set the initial (log-)variance like so
-                # to get a better decoder log likelihood.
-                ModelVarType.FIXED_LARGE: (
-                    np.append(self.posterior_variance[1], self.betas[1:]),
-                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
-                ),
-                ModelVarType.FIXED_SMALL: (
-                    self.posterior_variance,
-                    self.posterior_log_variance_clipped,
-                ),
-            }[self.model_var_type]
-            model_variance = _extract_into_tensor(model_variance, t, x.shape)
-            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
-        def process_xstart(x):
-            if denoised_fn is not None:
-                x = denoised_fn(x)
-            if clip_denoised:
-                return x.clamp(-1, 1)
-            return x
-        if self.model_mean_type == ModelMeanType.START_X:
-            pred_xstart = process_xstart(model_output)
-        else:
-            pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output))
-        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
-        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
-        return {
-            "mean": model_mean,
-            "variance": model_variance,
-            "log_variance": model_log_variance,
-            "pred_xstart": pred_xstart,
-            "extra": extra,
-        }
-    def _predict_xstart_from_eps(self, x_t, t, eps):
-        """
-        Recover the x_0 estimate implied by a noise estimate.
-        Evaluates sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * eps,
-        with both per-timestep coefficients broadcast to the shape of x_t.
-        :param x_t: the tensor at timestep t.
-        :param t: a batch of timestep indices.
-        :param eps: the noise estimate; must have the same shape as x_t.
-        :return: the implied x_0 prediction.
-        """
-        assert x_t.shape == eps.shape
-        coef_xt = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
-        coef_eps = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
-        return coef_xt * x_t - coef_eps * eps
-    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
-        """
-        Recover the noise estimate implied by an x_0 prediction.
-        Evaluates (sqrt_recip_alphas_cumprod[t] * x_t - pred_xstart) / sqrt_recipm1_alphas_cumprod[t],
-        with both per-timestep coefficients broadcast to the shape of x_t.
-        :param x_t: the tensor at timestep t.
-        :param t: a batch of timestep indices.
-        :param pred_xstart: the x_0 prediction.
-        :return: the implied noise estimate.
-        """
-        coef_xt = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
-        coef_eps = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
-        return (coef_xt * x_t - pred_xstart) / coef_eps
-    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
-        """
-        Compute the mean for the previous step, given a function cond_fn that
-        computes the gradient of a conditional log probability with respect to
-        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
-        condition on y.
-        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
-        :param cond_fn: callable invoked as cond_fn(x, t, **model_kwargs).
-        :param p_mean_var: dict providing the "mean" and "variance" entries.
-        :param x: the tensor handed to cond_fn.
-        :param t: the timestep batch handed to cond_fn.
-        :param model_kwargs: if not None, a dict of extra keyword arguments
-            forwarded to cond_fn.
-        :return: mean + variance * gradient, with mean and gradient cast to float.
-        """
-        # The default of None cannot be unpacked with **, so normalise it first.
-        if model_kwargs is None:
-            model_kwargs = {}
-        gradient = cond_fn(x, t, **model_kwargs)
-        new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
-        return new_mean
-    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
-        """
-        Compute what the p_mean_variance output would have been, should the
-        model's score function be conditioned by cond_fn.
-        See condition_mean() for details on cond_fn.
-        Unlike condition_mean(), this instead uses the conditioning strategy
-        from Song et al (2020).
-        :param cond_fn: callable invoked as cond_fn(x, t, **model_kwargs).
-        :param p_mean_var: dict providing at least the "pred_xstart" entry.
-        :param x: the tensor at timestep t.
-        :param t: a batch of timestep indices.
-        :param model_kwargs: if not None, a dict of extra keyword arguments
-            forwarded to cond_fn.
-        :return: a shallow copy of p_mean_var whose "pred_xstart" and "mean"
-                 entries are recomputed from the adjusted noise estimate.
-        """
-        # The default of None cannot be unpacked with **, so normalise it first.
-        if model_kwargs is None:
-            model_kwargs = {}
-        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
-        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
-        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
-        # Copy so the caller's dict is not modified in place.
-        out = p_mean_var.copy()
-        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
-        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
-        return out
-    def p_sample(
-        self,
-        model,
-        x,
-        t,
-        clip_denoised=True,
-        denoised_fn=None,
-        cond_fn=None,
-        model_kwargs=None,
-    ):
-        """
-        Draw a sample for the previous timestep from the model.
-        :param model: the model to sample from.
-        :param x: the current tensor, i.e. the input at timestep t.
-        :param t: a batch of timestep indices; entries equal to 0 receive no noise.
-        :param clip_denoised: forwarded to p_mean_variance().
-        :param denoised_fn: forwarded to p_mean_variance().
-        :param cond_fn: if not None, a gradient function used to shift the
-                        mean through condition_mean().
-        :param model_kwargs: if not None, a dict of extra keyword arguments to
-            pass to the model. This can be used for conditioning.
-        :return: a dict containing the following keys:
-                 - 'sample': a random sample from the model.
-                 - 'pred_xstart': a prediction of x_0.
-        """
-        stats = self.p_mean_variance(
-            model,
-            x,
-            t,
-            clip_denoised=clip_denoised,
-            denoised_fn=denoised_fn,
-            model_kwargs=model_kwargs,
-        )
-        noise = th.randn_like(x)
-        # Mask of shape (N, 1, ..., 1) that is zero wherever t == 0.
-        trailing_ones = [1] * (len(x.shape) - 1)
-        nonzero_mask = (t != 0).float().view(-1, *trailing_ones)
-        if cond_fn is not None:
-            stats["mean"] = self.condition_mean(cond_fn, stats, x, t, model_kwargs=model_kwargs)
-        std = th.exp(0.5 * stats["log_variance"])
-        sample = stats["mean"] + nonzero_mask * std * noise
-        return {"sample": sample, "pred_xstart": stats["pred_xstart"]}
-    def p_sample_loop(
-        self,
-        model,
-        shape,
-        noise=None,
-        clip_denoised=True,
-        denoised_fn=None,
-        cond_fn=None,
-        model_kwargs=None,
-        device=None,
-        progress=False,
-    ):
-        """
-        Run the full sampling loop and return only the final sample.
-        All arguments are forwarded unchanged to p_sample_loop_progressive().
-        :param model: the model module.
-        :param shape: the shape of the samples, (N, C, H, W).
-        :param noise: if specified, the starting noise; should have shape `shape`.
-        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
-        :param denoised_fn: if not None, a function which applies to the
-            x_start prediction before it is used to sample.
-        :param cond_fn: if not None, this is a gradient function that acts
-                        similarly to the model.
-        :param model_kwargs: if not None, a dict of extra keyword arguments to
-            pass to the model. This can be used for conditioning.
-        :param device: if specified, the device to create the samples on.
-                       If not specified, use a model parameter's device.
-        :param progress: if True, show a tqdm progress bar.
-        :return: the 'sample' entry of the last dict yielded by the progressive loop.
-        """
-        last = None
-        steps = self.p_sample_loop_progressive(
-            model,
-            shape,
-            noise=noise,
-            clip_denoised=clip_denoised,
-            denoised_fn=denoised_fn,
-            cond_fn=cond_fn,
-            model_kwargs=model_kwargs,
-            device=device,
-            progress=progress,
-        )
-        # Exhaust the generator; only the final yielded dict is kept.
-        for last in steps:
-            pass
-        return last["sample"]
-    def p_sample_loop_progressive(
-        self,
-        model,
-        shape,
-        noise=None,
-        clip_denoised=True,
-        denoised_fn=None,
-        cond_fn=None,
-        model_kwargs=None,
-        device=None,
-        progress=False,
-    ):
-        """
-        Generator variant of p_sample_loop(): yields the dict returned by
-        p_sample() after every timestep, from num_timesteps - 1 down to 0.
-        Arguments are the same as p_sample_loop().
-        """
-        if device is None:
-            device = next(model.parameters()).device
-        assert isinstance(shape, (tuple, list))
-        img = noise if noise is not None else th.randn(*shape, device=device)
-        indices = list(reversed(range(self.num_timesteps)))
-        if progress:
-            # Lazy import so that we don't depend on tqdm.
-            from tqdm.auto import tqdm
-            indices = tqdm(indices)
-        batch = shape[0]
-        for step in indices:
-            # Every element of the batch is at the same timestep.
-            t = th.tensor([step] * batch, device=device)
-            with th.no_grad():
-                out = self.p_sample(
-                    model,
-                    img,
-                    t,
-                    clip_denoised=clip_denoised,
-                    denoised_fn=denoised_fn,
-                    cond_fn=cond_fn,
-                    model_kwargs=model_kwargs,
-                )
-                yield out
-                img = out["sample"]
-    def ddim_sample(
-        self,
-        model,
-        x,
-        t,
-        clip_denoised=True,
-        denoised_fn=None,
-        cond_fn=None,
-        model_kwargs=None,
-        eta=0.0,
-    ):
-        """
-        Draw a sample for the previous timestep using DDIM.
-        Same usage as p_sample(); `eta` scales the injected noise (sigma is
-        zero when eta is 0.0).
-        :return: a dict with keys 'sample' and 'pred_xstart'.
-        """
-        out = self.p_mean_variance(
-            model,
-            x,
-            t,
-            clip_denoised=clip_denoised,
-            denoised_fn=denoised_fn,
-            model_kwargs=model_kwargs,
-        )
-        if cond_fn is not None:
-            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
-        pred_xstart = out["pred_xstart"]
-        # Usually our model outputs epsilon, but we re-derive it
-        # in case we used x_start or x_prev prediction.
-        eps = self._predict_eps_from_xstart(x, t, pred_xstart)
-        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
-        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
-        variance_ratio = th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
-        step_term = th.sqrt(1 - alpha_bar / alpha_bar_prev)
-        sigma = eta * variance_ratio * step_term
-        # Equation 12.
-        noise = th.randn_like(x)
-        eps_coef = th.sqrt(1 - alpha_bar_prev - sigma**2)
-        mean_pred = pred_xstart * th.sqrt(alpha_bar_prev) + eps_coef * eps
-        # Mask of shape (N, 1, ..., 1) that is zero wherever t == 0.
-        trailing_ones = [1] * (len(x.shape) - 1)
-        nonzero_mask = (t != 0).float().view(-1, *trailing_ones)
-        sample = mean_pred + nonzero_mask * sigma * noise
-        return {"sample": sample, "pred_xstart": pred_xstart}
-    def ddim_reverse_sample(
-        self,
-        model,
-        x,
-        t,
-        clip_denoised=True,
-        denoised_fn=None,
-        cond_fn=None,
-        model_kwargs=None,
-        eta=0.0,
-    ):
-        """
-        Step in the forward (t -> t+1) direction using the DDIM reverse ODE.
-        Only the deterministic path is supported, so eta must be 0.0.
-        :return: a dict with keys 'sample' and 'pred_xstart'.
-        """
-        assert eta == 0.0, "Reverse ODE only for deterministic path"
-        out = self.p_mean_variance(
-            model,
-            x,
-            t,
-            clip_denoised=clip_denoised,
-            denoised_fn=denoised_fn,
-            model_kwargs=model_kwargs,
-        )
-        if cond_fn is not None:
-            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
-        pred_xstart = out["pred_xstart"]
-        # Usually our model outputs epsilon, but we re-derive it
-        # in case we used x_start or x_prev prediction.
-        coef_x = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape)
-        coef_eps = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
-        eps = (coef_x * x - pred_xstart) / coef_eps
-        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
-        # Equation 12. reversed
-        mean_pred = pred_xstart * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
-        return {"sample": mean_pred, "pred_xstart": pred_xstart}
-    def ddim_sample_loop(
-        self,
-        model,
-        shape,
-        noise=None,
-        clip_denoised=True,
-        denoised_fn=None,
-        cond_fn=None,
-        model_kwargs=None,
-        device=None,
-        progress=False,
-        eta=0.0,
-    ):
-        """
-        Run the full DDIM sampling loop and return only the final sample.
-        Same usage as p_sample_loop(), plus `eta`, which is forwarded to
-        ddim_sample_loop_progressive().
-        """
-        last = None
-        steps = self.ddim_sample_loop_progressive(
-            model,
-            shape,
-            noise=noise,
-            clip_denoised=clip_denoised,
-            denoised_fn=denoised_fn,
-            cond_fn=cond_fn,
-            model_kwargs=model_kwargs,
-            device=device,
-            progress=progress,
-            eta=eta,
-        )
-        # Exhaust the generator; only the final yielded dict is kept.
-        for last in steps:
-            pass
-        return last["sample"]
-    def ddim_sample_loop_progressive(
-        self,
-        model,
-        shape,
-        noise=None,
-        clip_denoised=True,
-        denoised_fn=None,
-        cond_fn=None,
-        model_kwargs=None,
-        device=None,
-        progress=False,
-        eta=0.0,
-    ):
-        """
-        Generator variant of ddim_sample_loop(): yields the dict returned by
-        ddim_sample() after every timestep, from num_timesteps - 1 down to 0.
-        Same usage as p_sample_loop_progressive().
-        """
-        if device is None:
-            device = next(model.parameters()).device
-        assert isinstance(shape, (tuple, list))
-        img = noise if noise is not None else th.randn(*shape, device=device)
-        indices = list(reversed(range(self.num_timesteps)))
-        if progress:
-            # Lazy import so that we don't depend on tqdm.
-            from tqdm.auto import tqdm
-            indices = tqdm(indices)
-        batch = shape[0]
-        for step in indices:
-            # Every element of the batch is at the same timestep.
-            t = th.tensor([step] * batch, device=device)
-            with th.no_grad():
-                out = self.ddim_sample(
-                    model,
-                    img,
-                    t,
-                    clip_denoised=clip_denoised,
-                    denoised_fn=denoised_fn,
-                    cond_fn=cond_fn,
-                    model_kwargs=model_kwargs,
-                    eta=eta,
-                )
-                yield out
-                img = out["sample"]
-    def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None):
-        """
-        Compute one term of the variational lower-bound, in bits.
-        The natural-log quantities are divided by log(2), so the result is in
-        bits rather than nats. This allows for comparison to other papers.
-        :return: a dict with the following keys:
-                 - 'output': a shape [N] tensor of NLLs or KLs.
-                 - 'pred_xstart': the x_0 predictions.
-        """
-        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)
-        out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
-        nats_per_bit = np.log(2.0)
-        kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"])
-        kl = mean_flat(kl) / nats_per_bit
-        log_likelihood = discretized_gaussian_log_likelihood(
-            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
-        )
-        decoder_nll = -log_likelihood
-        assert decoder_nll.shape == x_start.shape
-        decoder_nll = mean_flat(decoder_nll) / nats_per_bit
-        # Batch entries at t == 0 take the decoder NLL; all others take
-        # KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)).
-        output = th.where((t == 0), decoder_nll, kl)
-        return {"output": output, "pred_xstart": out["pred_xstart"]}
-    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
-        """
-        Compute training losses for a single timestep.
-        :param model: the model to evaluate loss on.
-        :param x_start: the [N x C x ...] tensor of inputs.
-        :param t: a batch of timestep indices.
-        :param model_kwargs: if not None, a dict of extra keyword arguments to
-            pass to the model. This can be used for conditioning.
-        :param noise: if specified, the specific Gaussian noise to try to remove.
-        :return: a dict with the key "loss" containing a tensor of shape [N].
-                 Some mean or variance settings may also have other keys.
-        """
-        if model_kwargs is None:
-            model_kwargs = {}
-        if noise is None:
-            noise = th.randn_like(x_start)
-        # x_t is produced by q_sample() from x_start, t and the noise above.
-        x_t = self.q_sample(x_start, t, noise=noise)
-        terms = {}
-        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
-            # KL-type losses use the variational-bound term directly, with
-            # clipping of the x_start prediction disabled.
-            terms["loss"] = self._vb_terms_bpd(
-                model=model,
-                x_start=x_start,
-                x_t=x_t,
-                t=t,
-                clip_denoised=False,
-                model_kwargs=model_kwargs,
-            )["output"]
-            if self.loss_type == LossType.RESCALED_KL:
-                # RESCALED_KL multiplies the term by the number of timesteps.
-                terms["loss"] *= self.num_timesteps
-        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
-            model_output = model(x_t, t, **model_kwargs)
-            if self.model_var_type in [
-                ModelVarType.LEARNED,
-                ModelVarType.LEARNED_RANGE,
-            ]:
-                # With a learned variance the model emits 2 * C channels:
-                # the first C are the mean output, the rest the variance values.
-                B, C = x_t.shape[:2]
-                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
-                model_output, model_var_values = th.split(model_output, C, dim=1)
-                # Learn the variance using the variational bound, but don't let
-                # it affect our mean prediction.
-                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
-                # The lambda stands in for the model: it ignores its positional
-                # inputs and returns frozen_out (bound via the default r=...).
-                terms["vb"] = self._vb_terms_bpd(
-                    model=lambda *args, r=frozen_out: r,
-                    x_start=x_start,
-                    x_t=x_t,
-                    t=t,
-                    clip_denoised=False,
-                )["output"]
-                if self.loss_type == LossType.RESCALED_MSE:
-                    # Divide by 1000 for equivalence with initial implementation.
-                    # Without a factor of 1/1000, the VB term hurts the MSE term.
-                    terms["vb"] *= self.num_timesteps / 1000.0
-            # All three candidate targets are evaluated when the dict is built;
-            # model_mean_type then selects the one to regress against.
-            target = {
-                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0],
-                ModelMeanType.START_X: x_start,
-                ModelMeanType.EPSILON: noise,
-            }[self.model_mean_type]
-            assert model_output.shape == target.shape == x_start.shape
-            terms["mse"] = mean_flat((target - model_output) ** 2)
-            # The total loss adds the "vb" term only when the variance is learned.
-            if "vb" in terms:
-                terms["loss"] = terms["mse"] + terms["vb"]
-            else:
-                terms["loss"] = terms["mse"]
-        else:
-            raise NotImplementedError(self.loss_type)
-        return terms
-    def _prior_bpd(self, x_start):
-        """
-        Compute the prior KL term of the variational lower-bound, in
-        bits-per-dim.
-        The term is the KL between q at the last timestep
-        (num_timesteps - 1) and a Gaussian with zero mean and zero log-variance.
-        This term can't be optimized, as it only depends on the encoder.
-        :param x_start: the [N x C x ...] tensor of inputs.
-        :return: a batch of [N] KL values (in bits), one per batch element.
-        """
-        last_step = self.num_timesteps - 1
-        t = th.tensor([last_step] * x_start.shape[0], device=x_start.device)
-        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
-        kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
-        # Dividing by log(2) converts nats to bits.
-        return mean_flat(kl_prior) / np.log(2.0)
-    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
-        """
-        Evaluate the whole variational lower-bound, in bits-per-dim, together
-        with per-timestep diagnostics.
-        Timesteps are visited from num_timesteps - 1 down to 0, drawing fresh
-        noise for each one.
-        :param model: the model to evaluate loss on.
-        :param x_start: the [N x C x ...] tensor of inputs.
-        :param clip_denoised: if True, clip denoised samples.
-        :param model_kwargs: if not None, a dict of extra keyword arguments to
-            pass to the model. This can be used for conditioning.
-        :return: a dict containing the following keys:
-                 - total_bpd: the total variational lower-bound, per batch element.
-                 - prior_bpd: the prior term in the lower-bound.
-                 - vb: an [N x T] tensor of terms in the lower-bound.
-                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
-                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
-        """
-        device = x_start.device
-        batch_size = x_start.shape[0]
-        vb_terms, xstart_errors, eps_errors = [], [], []
-        for step in reversed(range(self.num_timesteps)):
-            t_batch = th.tensor([step] * batch_size, device=device)
-            noise = th.randn_like(x_start)
-            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
-            # Calculate VLB term at the current timestep
-            with th.no_grad():
-                out = self._vb_terms_bpd(
-                    model,
-                    x_start=x_start,
-                    x_t=x_t,
-                    t=t_batch,
-                    clip_denoised=clip_denoised,
-                    model_kwargs=model_kwargs,
-                )
-            pred_xstart = out["pred_xstart"]
-            vb_terms.append(out["output"])
-            xstart_errors.append(mean_flat((pred_xstart - x_start) ** 2))
-            eps = self._predict_eps_from_xstart(x_t, t_batch, pred_xstart)
-            eps_errors.append(mean_flat((eps - noise) ** 2))
-        # Stack the per-timestep [N] tensors along a new dimension 1.
-        vb = th.stack(vb_terms, dim=1)
-        xstart_mse = th.stack(xstart_errors, dim=1)
-        mse = th.stack(eps_errors, dim=1)
-        prior_bpd = self._prior_bpd(x_start)
-        total_bpd = vb.sum(dim=1) + prior_bpd
-        return {
-            "total_bpd": total_bpd,
-            "prior_bpd": prior_bpd,
-            "vb": vb,
-            "xstart_mse": xstart_mse,
-            "mse": mse,
-        }
-def _extract_into_tensor(arr, timesteps, broadcast_shape):
-    """
-    Look up per-timestep values in a 1-D numpy array and broadcast them.
-    :param arr: the 1-D numpy array.
-    :param timesteps: a tensor of indices into the array to extract.
-    :param broadcast_shape: a larger shape of K dimensions with the batch
-                            dimension equal to the length of timesteps.
-    :return: a float tensor of shape `broadcast_shape` on the device of
-             `timesteps`, constant along every non-batch dimension.
-    """
-    device = timesteps.device
-    values = th.from_numpy(arr).to(device=device)
-    res = values[timesteps].float()
-    # Append trailing singleton dimensions until the rank matches.
-    for _ in range(len(broadcast_shape) - len(res.shape)):
-        res = res.unsqueeze(-1)
-    # Adding zeros materialises the broadcast to the full shape.
-    return res + th.zeros(broadcast_shape, device=device)

videosys/diffusion/respace.py DELETED Viewed

@@ -1,119 +0,0 @@
-# Modified from OpenAI's diffusion repos
-#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-import numpy as np
-import torch as th
-from .gaussian_diffusion import GaussianDiffusion
-def space_timesteps(num_timesteps, section_counts):
-    """
-    Create a list of timesteps to use from an original diffusion process,
-    given the number of timesteps we want to take from equally-sized portions
-    of the original process.
-    For example, if there's 300 timesteps and the section counts are [10,15,20]
-    then the first 100 timesteps are strided to be 10 timesteps, the second 100
-    are strided to be 15 timesteps, and the final 100 are strided to be 20.
-    If the stride is a string starting with "ddim", then the fixed striding
-    from the DDIM paper is used, and only one section is allowed.
-    :param num_timesteps: the number of diffusion steps in the original
-                          process to divide up.
-    :param section_counts: either a list of numbers, or a string containing
-                           comma-separated numbers, indicating the step count
-                           per section. As a special case, use "ddimN" where N
-                           is a number of steps to use the striding from the
-                           DDIM paper.
-    :return: a set of diffusion steps from the original process to use.
-    :raises ValueError: if no integer stride yields exactly N steps for
-                        "ddimN", or if a section holds fewer steps than
-                        requested from it.
-    """
-    if isinstance(section_counts, str):
-        if section_counts.startswith("ddim"):
-            desired_count = int(section_counts[len("ddim") :])
-            for i in range(1, num_timesteps):
-                if len(range(0, num_timesteps, i)) == desired_count:
-                    return set(range(0, num_timesteps, i))
-            # Report the requested step count, not the length of the base process.
-            raise ValueError(f"cannot create exactly {desired_count} steps with an integer stride")
-        section_counts = [int(x) for x in section_counts.split(",")]
-    # Split the base process into len(section_counts) contiguous sections;
-    # the first `extra` sections are one step longer to absorb the remainder.
-    size_per = num_timesteps // len(section_counts)
-    extra = num_timesteps % len(section_counts)
-    start_idx = 0
-    all_steps = []
-    for i, section_count in enumerate(section_counts):
-        size = size_per + (1 if i < extra else 0)
-        if size < section_count:
-            raise ValueError(f"cannot divide section of {size} steps into {section_count}")
-        if section_count <= 1:
-            frac_stride = 1
-        else:
-            # Fractional stride so the first and last step of the section are both taken.
-            frac_stride = (size - 1) / (section_count - 1)
-        cur_idx = 0.0
-        taken_steps = []
-        for _ in range(section_count):
-            taken_steps.append(start_idx + round(cur_idx))
-            cur_idx += frac_stride
-        all_steps += taken_steps
-        start_idx += size
-    return set(all_steps)
-class SpacedDiffusion(GaussianDiffusion):
-    """
-    A diffusion process that keeps only a subset of a base process's timesteps.
-    New betas are derived from the base process's cumulative alpha products at
-    the retained steps, and every model or cond_fn handed to the overridden
-    methods is wrapped so it receives the original timestep indices.
-    :param use_timesteps: a collection (sequence or set) of timesteps from the
-                          original diffusion process to retain.
-    :param kwargs: the kwargs to create the base diffusion process.
-    """
-    def __init__(self, use_timesteps, **kwargs):
-        self.use_timesteps = set(use_timesteps)
-        self.timestep_map = []
-        self.original_num_steps = len(kwargs["betas"])
-        base = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
-        prev_alpha_cumprod = 1.0
-        respaced_betas = []
-        for step, alpha_cumprod in enumerate(base.alphas_cumprod):
-            if step not in self.use_timesteps:
-                continue
-            # Beta spanning the gap from the previously retained step to this one.
-            respaced_betas.append(1 - alpha_cumprod / prev_alpha_cumprod)
-            prev_alpha_cumprod = alpha_cumprod
-            self.timestep_map.append(step)
-        kwargs["betas"] = np.array(respaced_betas)
-        super().__init__(**kwargs)
-    def p_mean_variance(self, model, *args, **kwargs):  # pylint: disable=signature-differs
-        wrapped = self._wrap_model(model)
-        return super().p_mean_variance(wrapped, *args, **kwargs)
-    def training_losses(self, model, *args, **kwargs):  # pylint: disable=signature-differs
-        wrapped = self._wrap_model(model)
-        return super().training_losses(wrapped, *args, **kwargs)
-    def condition_mean(self, cond_fn, *args, **kwargs):
-        wrapped = self._wrap_model(cond_fn)
-        return super().condition_mean(wrapped, *args, **kwargs)
-    def condition_score(self, cond_fn, *args, **kwargs):
-        wrapped = self._wrap_model(cond_fn)
-        return super().condition_score(wrapped, *args, **kwargs)
-    def _wrap_model(self, model):
-        # Wrap at most once: an already-wrapped callable is returned as is.
-        if not isinstance(model, _WrappedModel):
-            model = _WrappedModel(model, self.timestep_map, self.original_num_steps)
-        return model
-    def _scale_timesteps(self, t):
-        # Timesteps are returned unchanged.
-        return t
-class _WrappedModel:
-    """
-    Callable adapter that translates timestep indices through `timestep_map`
-    before invoking the wrapped model. No rescaling of the mapped timesteps
-    is applied; `original_num_steps` is only stored.
-    """
-    def __init__(self, model, timestep_map, original_num_steps):
-        self.model = model
-        self.timestep_map = timestep_map
-        self.original_num_steps = original_num_steps
-    def __call__(self, x, ts, **kwargs):
-        # Build the lookup table on the same device and dtype as the indices.
-        lookup = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
-        return self.model(x, lookup[ts], **kwargs)

videosys/diffusion/timestep_sampler.py DELETED Viewed

@@ -1,143 +0,0 @@
-# Modified from OpenAI's diffusion repos
-#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-from abc import ABC, abstractmethod
-import numpy as np
-import torch as th
-import torch.distributed as dist
-def create_named_schedule_sampler(name, diffusion):
-    """
-    Build one of the pre-defined ScheduleSampler implementations by name.
-    :param name: the name of the sampler: "uniform" or "loss-second-moment".
-    :param diffusion: the diffusion object to sample for.
-    :raises NotImplementedError: if `name` is not one of the known samplers.
-    """
-    if name == "uniform":
-        return UniformSampler(diffusion)
-    if name == "loss-second-moment":
-        return LossSecondMomentResampler(diffusion)
-    raise NotImplementedError(f"unknown schedule sampler: {name}")
-class ScheduleSampler(ABC):
-    """
-    A distribution over timesteps in the diffusion process, intended to reduce
-    variance of the objective.
-    By default, samplers perform unbiased importance sampling, in which the
-    objective's mean is unchanged.
-    However, subclasses may override sample() to change how the resampled
-    terms are reweighted, allowing for actual changes in the objective.
-    """
-    @abstractmethod
-    def weights(self):
-        """
-        Get a numpy array of weights, one per diffusion step.
-        The weights needn't be normalized, but must be positive.
-        """
-    def sample(self, batch_size, device):
-        """
-        Importance-sample timesteps for a batch.
-        Timesteps are drawn with probability proportional to weights(), and
-        each one is paired with the weight 1 / (num_steps * probability).
-        :param batch_size: the number of timesteps.
-        :param device: the torch device to save to.
-        :return: a tuple (timesteps, weights):
-                 - timesteps: a tensor of timestep indices.
-                 - weights: a tensor of weights to scale the resulting losses.
-        """
-        raw = self.weights()
-        probs = raw / np.sum(raw)
-        num_steps = len(probs)
-        drawn = np.random.choice(num_steps, size=(batch_size,), p=probs)
-        timesteps = th.from_numpy(drawn).long().to(device)
-        importance = 1 / (num_steps * probs[drawn])
-        return timesteps, th.from_numpy(importance).float().to(device)
-class UniformSampler(ScheduleSampler):
-    """
-    Sampler that gives every diffusion timestep the same weight.
-    """
-    def __init__(self, diffusion):
-        self.diffusion = diffusion
-        # One unit weight per timestep of the given diffusion object.
-        num_steps = diffusion.num_timesteps
-        self._weights = np.ones([num_steps])
-    def weights(self):
-        return self._weights
-class LossAwareSampler(ScheduleSampler):
-    """
-    Base class for samplers whose weights are updated from observed losses.
-    """
-    def update_with_local_losses(self, local_ts, local_losses):
-        """
-        Update the reweighting using losses from a model.
-        Call this method from each rank with a batch of timesteps and the
-        corresponding losses for each of those timesteps.
-        This method will perform synchronization to make sure all of the ranks
-        maintain the exact same reweighting.
-        :param local_ts: an integer Tensor of timesteps.
-        :param local_losses: a 1D Tensor of losses.
-        """
-        device = local_ts.device
-        world_size = dist.get_world_size()
-        # First exchange the per-rank batch sizes.
-        batch_sizes = [th.tensor([0], dtype=th.int32, device=device) for _ in range(world_size)]
-        dist.all_gather(
-            batch_sizes,
-            th.tensor([len(local_ts)], dtype=th.int32, device=device),
-        )
-        batch_sizes = [size.item() for size in batch_sizes]
-        max_bs = max(batch_sizes)
-        # Receive buffers are sized to the maximum batch size.
-        # NOTE(review): local_ts / local_losses themselves are sent un-padded;
-        # confirm all ranks always use the same batch size.
-        timestep_batches = [th.zeros(max_bs).to(local_ts) for _ in batch_sizes]
-        loss_batches = [th.zeros(max_bs).to(local_losses) for _ in batch_sizes]
-        dist.all_gather(timestep_batches, local_ts)
-        dist.all_gather(loss_batches, local_losses)
-        # Keep only the first `count` entries received from each rank.
-        timesteps = []
-        losses = []
-        for ts_buf, loss_buf, count in zip(timestep_batches, loss_batches, batch_sizes):
-            timesteps.extend(value.item() for value in ts_buf[:count])
-            losses.extend(value.item() for value in loss_buf[:count])
-        self.update_with_all_losses(timesteps, losses)
-    @abstractmethod
-    def update_with_all_losses(self, ts, losses):
-        """
-        Update the reweighting using losses from a model.
-        Sub-classes should override this method to update the reweighting
-        using losses from the model.
-        This method directly updates the reweighting without synchronizing
-        between workers. It is called by update_with_local_losses from all
-        ranks with identical arguments. Thus, it should have deterministic
-        behavior to maintain state across workers.
-        :param ts: a list of int timesteps.
-        :param losses: a list of float losses, one per timestep.
-        """
-class LossSecondMomentResampler(LossAwareSampler):
-    """
-    Sampler that weights each timestep by the root-mean-square of its most
-    recent losses, mixed with a small uniform component.
-    :param diffusion: the diffusion object; only num_timesteps is read.
-    :param history_per_term: how many recent losses are kept per timestep.
-    :param uniform_prob: total probability mass spread uniformly over all
-                         timesteps once the history is full.
-    """
-    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
-        self.diffusion = diffusion
-        self.history_per_term = history_per_term
-        self.uniform_prob = uniform_prob
-        self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64)
-        # `np.int` was only an alias of the builtin int and has been removed
-        # from NumPy; the builtin selects the same default integer dtype.
-        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)
-    def weights(self):
-        """
-        Return uniform weights until every timestep has a full loss history,
-        then the normalised RMS-loss weights blended with uniform_prob.
-        """
-        if not self._warmed_up():
-            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
-        weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
-        weights /= np.sum(weights)
-        weights *= 1 - self.uniform_prob
-        weights += self.uniform_prob / len(weights)
-        return weights
-    def update_with_all_losses(self, ts, losses):
-        """
-        Record one loss per given timestep, keeping the newest
-        history_per_term values for each timestep.
-        :param ts: a list of int timesteps.
-        :param losses: a list of float losses, one per timestep.
-        """
-        for t, loss in zip(ts, losses):
-            if self._loss_counts[t] == self.history_per_term:
-                # Shift out the oldest loss term.
-                self._loss_history[t, :-1] = self._loss_history[t, 1:]
-                self._loss_history[t, -1] = loss
-            else:
-                self._loss_history[t, self._loss_counts[t]] = loss
-                self._loss_counts[t] += 1
-    def _warmed_up(self):
-        # True once every timestep has accumulated history_per_term losses.
-        return (self._loss_counts == self.history_per_term).all()

{eval/pab/commom_metrics → videosys/models/autoencoders}/__init__.py RENAMED Viewed

File without changes

videosys/models/{cogvideo/autoencoder_kl.py → autoencoders/autoencoder_kl_cogvideox.py} RENAMED Viewed

@@ -20,16 +20,16 @@ from diffusers.models.activations import get_activation
 from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from diffusers.models.modeling_utils import ModelMixin
-from diffusers.utils import logging
 from diffusers.utils.accelerate_utils import apply_forward_hook
-from .modules import CogVideoXDownsample3D, CogVideoXUpsample3D
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class CogVideoXSafeConv3d(nn.Conv3d):
-    """
     A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model.
     """
@@ -61,12 +61,12 @@ class CogVideoXCausalConv3d(nn.Module):
     r"""A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model.
     Args:
-        in_channels (int): Number of channels in the input tensor.
-        out_channels (int): Number of output channels.
-        kernel_size (Union[int, Tuple[int, int, int]]): Size of the convolutional kernel.
-        stride (int, optional): Stride of the convolution. Default is 1.
-        dilation (int, optional): Dilation rate of the convolution. Default is 1.
-        pad_mode (str, optional): Padding mode. Default is "constant".
     """
     def __init__(
@@ -111,19 +111,10 @@ class CogVideoXCausalConv3d(nn.Module):
         self.conv_cache = None
     def fake_context_parallel_forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        dim = self.temporal_dim
         kernel_size = self.time_kernel_size
-        if kernel_size == 1:
-            return inputs
-        inputs = inputs.transpose(0, dim)
-        if self.conv_cache is not None:
-            inputs = torch.cat([self.conv_cache.transpose(0, dim).to(inputs.device), inputs], dim=0)
-        else:
-            inputs = torch.cat([inputs[:1]] * (kernel_size - 1) + [inputs], dim=0)
-        inputs = inputs.transpose(0, dim).contiguous()
         return inputs
     def _clear_fake_context_parallel_cache(self):
@@ -131,16 +122,17 @@ class CogVideoXCausalConv3d(nn.Module):
         self.conv_cache = None
     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        input_parallel = self.fake_context_parallel_forward(inputs)
         self._clear_fake_context_parallel_cache()
-        self.conv_cache = input_parallel[:, :, -self.time_kernel_size + 1 :].contiguous().detach().clone().cpu()
         padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
-        input_parallel = F.pad(input_parallel, padding_2d, mode="constant", value=0)
-        output_parallel = self.conv(input_parallel)
-        output = output_parallel
         return output
@@ -156,6 +148,8 @@ class CogVideoXSpatialNorm3D(nn.Module):
             The number of channels for input to group normalization layer, and output of the spatial norm layer.
         zq_channels (`int`):
             The number of channels for the quantized vector as described in the paper.
     """
     def __init__(
@@ -190,17 +184,26 @@ class CogVideoXResnetBlock3D(nn.Module):
     A 3D ResNet block used in the CogVideoX model.
     Args:
-        in_channels (int): Number of input channels.
-        out_channels (Optional[int], optional):
-            Number of output channels. If None, defaults to `in_channels`. Default is None.
-        dropout (float, optional): Dropout rate. Default is 0.0.
-        temb_channels (int, optional): Number of time embedding channels. Default is 512.
-        groups (int, optional): Number of groups for group normalization. Default is 32.
-        eps (float, optional): Epsilon value for normalization layers. Default is 1e-6.
-        non_linearity (str, optional): Activation function to use. Default is "swish".
-        conv_shortcut (bool, optional): If True, use a convolutional shortcut. Default is False.
-        spatial_norm_dim (Optional[int], optional): Dimension of the spatial normalization. Default is None.
-        pad_mode (str, optional): Padding mode. Default is "first".
     """
     def __init__(
@@ -302,18 +305,28 @@ class CogVideoXDownBlock3D(nn.Module):
     A downsampling block used in the CogVideoX model.
     Args:
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        temb_channels (int): Number of time embedding channels.
-        dropout (float, optional): Dropout rate. Default is 0.0.
-        num_layers (int, optional): Number of layers in the block. Default is 1.
-        resnet_eps (float, optional): Epsilon value for the ResNet layers. Default is 1e-6.
-        resnet_act_fn (str, optional): Activation function for the ResNet layers. Default is "swish".
-        resnet_groups (int, optional): Number of groups for group normalization in the ResNet layers. Default is 32.
-        add_downsample (bool, optional): If True, add a downsampling layer at the end of the block. Default is True.
-        downsample_padding (int, optional): Padding for the downsampling layer. Default is 0.
-        compress_time (bool, optional): If True, apply temporal compression. Default is False.
-        pad_mode (str, optional): Padding mode. Default is "first".
     """
     _supports_gradient_checkpointing = True
@@ -398,15 +411,24 @@ class CogVideoXMidBlock3D(nn.Module):
     A middle block used in the CogVideoX model.
     Args:
-        in_channels (int): Number of input channels.
-        temb_channels (int): Number of time embedding channels.
-        dropout (float, optional): Dropout rate. Default is 0.0.
-        num_layers (int, optional): Number of layers in the block. Default is 1.
-        resnet_eps (float, optional): Epsilon value for the ResNet layers. Default is 1e-6.
-        resnet_act_fn (str, optional): Activation function for the ResNet layers. Default is "swish".
-        resnet_groups (int, optional): Number of groups for group normalization in the ResNet layers. Default is 32.
-        spatial_norm_dim (Optional[int], optional): Dimension of the spatial normalization. Default is None.
-        pad_mode (str, optional): Padding mode. Default is "first".
     """
     _supports_gradient_checkpointing = True
@@ -473,19 +495,30 @@ class CogVideoXUpBlock3D(nn.Module):
     An upsampling block used in the CogVideoX model.
     Args:
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        temb_channels (int): Number of time embedding channels.
-        dropout (float, optional): Dropout rate. Default is 0.0.
-        num_layers (int, optional): Number of layers in the block. Default is 1.
-        resnet_eps (float, optional): Epsilon value for the ResNet layers. Default is 1e-6.
-        resnet_act_fn (str, optional): Activation function for the ResNet layers. Default is "swish".
-        resnet_groups (int, optional): Number of groups for group normalization in the ResNet layers. Default is 32.
-        spatial_norm_dim (int, optional): Dimension of the spatial normalization. Default is 16.
-        add_upsample (bool, optional): If True, add an upsampling layer at the end of the block. Default is True.
-        upsample_padding (int, optional): Padding for the upsampling layer. Default is 1.
-        compress_time (bool, optional): If True, apply temporal compression. Default is False.
-        pad_mode (str, optional): Padding mode. Default is "first".
     """
     def __init__(
@@ -576,14 +609,12 @@ class CogVideoXEncoder3D(nn.Module):
             options.
         block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
             The number of output channels for each block.
         layers_per_block (`int`, *optional*, defaults to 2):
             The number of layers per block.
         norm_num_groups (`int`, *optional*, defaults to 32):
             The number of groups for normalization.
-        act_fn (`str`, *optional*, defaults to `"silu"`):
-            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
-        double_z (`bool`, *optional*, defaults to `True`):
-            Whether to double the number of output channels for the last block.
     """
     _supports_gradient_checkpointing = True
@@ -712,14 +743,12 @@ class CogVideoXDecoder3D(nn.Module):
             The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
         block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
             The number of output channels for each block.
         layers_per_block (`int`, *optional*, defaults to 2):
             The number of layers per block.
         norm_num_groups (`int`, *optional*, defaults to 32):
             The number of groups for normalization.
-        act_fn (`str`, *optional*, defaults to `"silu"`):
-            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
-        norm_type (`str`, *optional*, defaults to `"group"`):
-            The normalization type to use. Can be either `"group"` or `"spatial"`.
     """
     _supports_gradient_checkpointing = True
@@ -860,7 +889,7 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             Tuple of block output channels.
         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
         sample_size (`int`, *optional*, defaults to `32`): Sample input size.
-        scaling_factor (`float`, *optional*, defaults to 0.18215):
             The component-wise standard deviation of the trained latent space computed using the first batch of the
             training set. This is used to scale the latent space to have unit variance when training the diffusion
             model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
@@ -900,7 +929,8 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         norm_eps: float = 1e-6,
         norm_num_groups: int = 32,
         temporal_compression_ratio: float = 4,
-        sample_size: int = 256,
         scaling_factor: float = 1.15258426,
         shift_factor: Optional[float] = None,
         latents_mean: Optional[Tuple[float]] = None,
@@ -939,25 +969,105 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         self.use_slicing = False
         self.use_tiling = False
-        self.tile_sample_min_size = self.config.sample_size
-        sample_size = (
-            self.config.sample_size[0]
-            if isinstance(self.config.sample_size, (list, tuple))
-            else self.config.sample_size
         )
-        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
-        self.tile_overlap_factor = 0.25
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)):
             module.gradient_checkpointing = value
-    def clear_fake_context_parallel_cache(self):
         for name, module in self.named_modules():
             if isinstance(module, CogVideoXCausalConv3d):
                 logger.debug(f"Clearing fake Context Parallel cache for layer: {name}")
                 module._clear_fake_context_parallel_cache()
     @apply_forward_hook
     def encode(
         self, x: torch.Tensor, return_dict: bool = True
@@ -982,8 +1092,34 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)
     @apply_forward_hook
-    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
         """
         Decode a batch of images.
@@ -996,13 +1132,111 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             [`~models.vae.DecoderOutput`] or `tuple`:
                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                 returned.
         """
-        if self.post_quant_conv is not None:
-            z = self.post_quant_conv(z)
-        dec = self.decoder(z)
         if not return_dict:
             return (dec,)
         return DecoderOutput(sample=dec)
     def forward(

 from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils.accelerate_utils import apply_forward_hook
+from videosys.utils.logging import logger
+from ..modules.downsampling import CogVideoXDownsample3D
+from ..modules.upsampling import CogVideoXUpsample3D
 class CogVideoXSafeConv3d(nn.Conv3d):
+    r"""
     A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model.
     """
     r"""A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model.
     Args:
+        in_channels (`int`): Number of channels in the input tensor.
+        out_channels (`int`): Number of output channels produced by the convolution.
+        kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel.
+        stride (`int`, defaults to `1`): Stride of the convolution.
+        dilation (`int`, defaults to `1`): Dilation rate of the convolution.
+        pad_mode (`str`, defaults to `"constant"`): Padding mode.
     """
     def __init__(
         self.conv_cache = None
     def fake_context_parallel_forward(self, inputs: torch.Tensor) -> torch.Tensor:
         kernel_size = self.time_kernel_size
+        if kernel_size > 1:
+            cached_inputs = [self.conv_cache] if self.conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1)
+            inputs = torch.cat(cached_inputs + [inputs], dim=2)
         return inputs
     def _clear_fake_context_parallel_cache(self):
         self.conv_cache = None
     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        inputs = self.fake_context_parallel_forward(inputs)
         self._clear_fake_context_parallel_cache()
+        # Note: we could move these to the CPU for a lower maximum memory usage, but it's only a few
+        # hundred megabytes, so let's not do it for now.
+        self.conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
         padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
+        inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
+        output = self.conv(inputs)
         return output
             The number of channels for input to group normalization layer, and output of the spatial norm layer.
         zq_channels (`int`):
             The number of channels for the quantized vector as described in the paper.
+        groups (`int`):
+            Number of groups to separate the channels into for group normalization.
     """
     def __init__(
     A 3D ResNet block used in the CogVideoX model.
     Args:
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`, *optional*):
+            Number of output channels. If None, defaults to `in_channels`.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        non_linearity (`str`, defaults to `"swish"`):
+            Activation function to use.
+        conv_shortcut (`bool`, defaults to `False`):
+            Whether or not to use a convolution shortcut.
+        spatial_norm_dim (`int`, *optional*):
+            The dimension to use for spatial norm if it is to be used instead of group norm.
+        pad_mode (`str`, defaults to `"first"`):
+            Padding mode.
     """
     def __init__(
     A downsampling block used in the CogVideoX model.
     Args:
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`, *optional*):
+            Number of output channels. If None, defaults to `in_channels`.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        num_layers (`int`, defaults to `1`):
+            Number of resnet layers.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        resnet_eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        resnet_act_fn (`str`, defaults to `"swish"`):
+            Activation function to use.
+        resnet_groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        add_downsample (`bool`, defaults to `True`):
+            Whether or not to use a downsampling layer. If not used, output dimension would be same as input dimension.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to downsample across temporal dimension.
+        pad_mode (`str`, defaults to `"first"`):
+            Padding mode.
     """
     _supports_gradient_checkpointing = True
     A middle block used in the CogVideoX model.
     Args:
+        in_channels (`int`):
+            Number of input channels.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        num_layers (`int`, defaults to `1`):
+            Number of resnet layers.
+        resnet_eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        resnet_act_fn (`str`, defaults to `"swish"`):
+            Activation function to use.
+        resnet_groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        spatial_norm_dim (`int`, *optional*):
+            The dimension to use for spatial norm if it is to be used instead of group norm.
+        pad_mode (`str`, defaults to `"first"`):
+            Padding mode.
     """
     _supports_gradient_checkpointing = True
     An upsampling block used in the CogVideoX model.
     Args:
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`, *optional*):
+            Number of output channels. If None, defaults to `in_channels`.
+        temb_channels (`int`, defaults to `512`):
+            Number of time embedding channels.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        num_layers (`int`, defaults to `1`):
+            Number of resnet layers.
+        resnet_eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        resnet_act_fn (`str`, defaults to `"swish"`):
+            Activation function to use.
+        resnet_groups (`int`, defaults to `32`):
+            Number of groups to separate the channels into for group normalization.
+        spatial_norm_dim (`int`, defaults to `16`):
+            The dimension to use for spatial norm if it is to be used instead of group norm.
+        add_upsample (`bool`, defaults to `True`):
+            Whether or not to use an upsampling layer. If not used, output dimension would be same as input dimension.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to downsample across temporal dimension.
+        pad_mode (`str`, defaults to `"first"`):
+            Padding mode.
     """
     def __init__(
             options.
         block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
             The number of output channels for each block.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
         layers_per_block (`int`, *optional*, defaults to 2):
             The number of layers per block.
         norm_num_groups (`int`, *optional*, defaults to 32):
             The number of groups for normalization.
     """
     _supports_gradient_checkpointing = True
             The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
         block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
             The number of output channels for each block.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
         layers_per_block (`int`, *optional*, defaults to 2):
             The number of layers per block.
         norm_num_groups (`int`, *optional*, defaults to 32):
             The number of groups for normalization.
     """
     _supports_gradient_checkpointing = True
             Tuple of block output channels.
         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
         sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+        scaling_factor (`float`, *optional*, defaults to `1.15258426`):
             The component-wise standard deviation of the trained latent space computed using the first batch of the
             training set. This is used to scale the latent space to have unit variance when training the diffusion
             model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
         norm_eps: float = 1e-6,
         norm_num_groups: int = 32,
         temporal_compression_ratio: float = 4,
+        sample_height: int = 480,
+        sample_width: int = 720,
         scaling_factor: float = 1.15258426,
         shift_factor: Optional[float] = None,
         latents_mean: Optional[Tuple[float]] = None,
         self.use_slicing = False
         self.use_tiling = False
+        # Can be increased to decode more latent frames at once, but comes at a reasonable memory cost and it is not
+        # recommended because the temporal parts of the VAE, here, are tricky to understand.
+        # If you decode X latent frames together, the number of output frames is:
+        #     (X + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) => X + 6 frames
+        #
+        # Example with num_latent_frames_batch_size = 2:
+        #     - 12 latent frames: (0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11) are processed together
+        #         => (12 // 2 frame slices) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale))
+        #         => 6 * 8 = 48 frames
+        #     - 13 latent frames: (0, 1, 2) (special case), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12) are processed together
+        #         => (1 frame slice) * ((3 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) +
+        #            ((13 - 3) // 2) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale))
+        #         => 1 * 9 + 5 * 8 = 49 frames
+        # It has been implemented this way so as to not have "magic values" in the code base that would be hard to explain. Note that
+        # setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different
+        # number of temporal frames.
+        self.num_latent_frames_batch_size = 2
+        # We make the minimum sample height and width for tiling half of the generally supported resolution.
+        self.tile_sample_min_height = sample_height // 2
+        self.tile_sample_min_width = sample_width // 2
+        self.tile_latent_min_height = int(
+            self.tile_sample_min_height / (2 ** (len(self.config.block_out_channels) - 1))
         )
+        self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1)))
+        # These are experimental overlap factors that were chosen based on experimentation and seem to work best for
+        # 720x480 (WxH) resolution. The above resolution is the strongly recommended generation resolution in CogVideoX
+        # and so the tiling implementation has only been tested on those specific resolutions.
+        self.tile_overlap_factor_height = 1 / 6
+        self.tile_overlap_factor_width = 1 / 5
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)):
             module.gradient_checkpointing = value
+    def _clear_fake_context_parallel_cache(self):
         for name, module in self.named_modules():
             if isinstance(module, CogVideoXCausalConv3d):
                 logger.debug(f"Clearing fake Context Parallel cache for layer: {name}")
                 module._clear_fake_context_parallel_cache()
+    def enable_tiling(
+        self,
+        tile_sample_min_height: Optional[int] = None,
+        tile_sample_min_width: Optional[int] = None,
+        tile_overlap_factor_height: Optional[float] = None,
+        tile_overlap_factor_width: Optional[float] = None,
+    ) -> None:
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        Any argument left as `None` keeps the value currently stored on the model.
+        Args:
+            tile_sample_min_height (`int`, *optional*):
+                The minimum height required for a sample to be separated into tiles across the height dimension.
+            tile_sample_min_width (`int`, *optional*):
+                The minimum width required for a sample to be separated into tiles across the width dimension.
+            tile_overlap_factor_height (`float`, *optional*):
+                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
+                no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher
+                value might cause more tiles to be processed leading to slow down of the decoding process.
+            tile_overlap_factor_width (`float`, *optional*):
+                The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there
+                are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher
+                value might cause more tiles to be processed leading to slow down of the decoding process.
+        """
+        self.use_tiling = True
+        # A tile size of 0 is not usable (it would become a zero stride in `tiled_decode`), so falsy sizes
+        # deliberately fall back to the current value.
+        self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
+        self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+        # Latent tile sizes are the sample tile sizes divided by 2 ** (number of blocks - 1).
+        spatial_compression = 2 ** (len(self.config.block_out_channels) - 1)
+        self.tile_latent_min_height = int(self.tile_sample_min_height / spatial_compression)
+        self.tile_latent_min_width = int(self.tile_sample_min_width / spatial_compression)
+        # Compare against None instead of using `or`, so that an explicit overlap factor of 0 is honoured
+        # rather than silently replaced by the previous value.
+        if tile_overlap_factor_height is not None:
+            self.tile_overlap_factor_height = tile_overlap_factor_height
+        if tile_overlap_factor_width is not None:
+            self.tile_overlap_factor_width = tile_overlap_factor_width
+    def disable_tiling(self) -> None:
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_tiling = False
+    def enable_slicing(self) -> None:
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+    def disable_slicing(self) -> None:
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
     @apply_forward_hook
     def encode(
         self, x: torch.Tensor, return_dict: bool = True
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)
+    def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode latents a few latent frames at a time and concatenate the decoded chunks along the frame dimension.
+        Args:
+            z (`torch.Tensor`): Latents of shape `(batch_size, num_channels, num_frames, height, width)`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        _, _, num_frames, height, width = z.shape
+        # Large latents are handed to the tiled path when tiling is enabled.
+        if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
+            return self.tiled_decode(z, return_dict=return_dict)
+        frame_batch_size = self.num_latent_frames_batch_size
+        # Always run at least one chunk: with fewer latent frames than `frame_batch_size` the floor division
+        # is 0, the loop would not execute, and `torch.cat` would raise on an empty list.
+        num_batches = max(num_frames // frame_batch_size, 1)
+        # Frames that do not fill a whole chunk are folded into the first chunk.
+        remaining_frames = num_frames % frame_batch_size
+        dec = []
+        for i in range(num_batches):
+            start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
+            end_frame = frame_batch_size * (i + 1) + remaining_frames
+            z_intermediate = z[:, :, start_frame:end_frame]
+            if self.post_quant_conv is not None:
+                z_intermediate = self.post_quant_conv(z_intermediate)
+            z_intermediate = self.decoder(z_intermediate)
+            dec.append(z_intermediate)
+        # The causal-conv caches carry state from chunk to chunk; reset them once the whole clip is decoded.
+        self._clear_fake_context_parallel_cache()
+        dec = torch.cat(dec, dim=2)
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
     @apply_forward_hook
+    def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
         """
         Decode a batch of images.
             [`~models.vae.DecoderOutput`] or `tuple`:
                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                 returned.
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+        if not return_dict:
+            return (decoded,)
+        return DecoderOutput(sample=decoded)
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        r"""
+        Linearly cross-fade the trailing entries of `a` into the leading entries of `b` along dimension 3 (the height
+        axis of the 5D tiles passed in by `tiled_decode`).
+        `b` is modified in place and returned. `blend_extent` is clamped to the size of dimension 3 of both tensors.
+        """
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        for y in range(blend_extent):
+            # The weight of `a` falls from 1 to 0 while the weight of `b` rises from 0 to 1 across the blended band.
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                y / blend_extent
+            )
+        return b
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        r"""
+        Linearly cross-fade the trailing entries of `a` into the leading entries of `b` along dimension 4 (the width
+        axis of the 5D tiles passed in by `tiled_decode`).
+        `b` is modified in place and returned. `blend_extent` is clamped to the size of dimension 4 of both tensors.
+        """
+        blend_extent = min(a.shape[4], b.shape[4], blend_extent)
+        for x in range(blend_extent):
+            # The weight of `a` falls from 1 to 0 while the weight of `b` rises from 0 to 1 across the blended band.
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                x / blend_extent
+            )
+        return b
+    def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
         """
+        # Rough memory assessment:
+        #   - In CogVideoX-2B, there are a total of 24 CausalConv3d layers.
+        #   - The biggest intermediate dimensions are: [1, 128, 9, 480, 720].
+        #   - Assume fp16 (2 bytes per value).
+        # Memory required: 1 * 128 * 9 * 480 * 720 * 24 * 2 / 1024**3 = 17.8 GB
+        #
+        # Memory assessment when using tiling:
+        #   - Assume everything as above but now HxW is 240x360 by tiling in half
+        # Memory required: 1 * 128 * 9 * 240 * 360 * 24 * 2 / 1024**3 = 4.5 GB
+        _, _, num_frames, height, width = z.shape
+        # `overlap_*` is the stride between tile origins in latent space; `blend_extent_*` is the width of the
+        # cross-faded band in sample space; `row_limit_*` is how much of each decoded tile is kept.
+        overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height))
+        overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width))
+        blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height)
+        blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width)
+        row_limit_height = self.tile_sample_min_height - blend_extent_height
+        row_limit_width = self.tile_sample_min_width - blend_extent_width
+        frame_batch_size = self.num_latent_frames_batch_size
+        # Always run at least one frame chunk per tile: with fewer latent frames than `frame_batch_size` the
+        # floor division is 0 and `torch.cat` would raise on an empty list.
+        num_batches = max(num_frames // frame_batch_size, 1)
+        # Frames that do not fill a whole chunk are folded into the first chunk.
+        remaining_frames = num_frames % frame_batch_size
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, height, overlap_height):
+            row = []
+            for j in range(0, width, overlap_width):
+                time = []
+                for k in range(num_batches):
+                    start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
+                    end_frame = frame_batch_size * (k + 1) + remaining_frames
+                    tile = z[
+                        :,
+                        :,
+                        start_frame:end_frame,
+                        i : i + self.tile_latent_min_height,
+                        j : j + self.tile_latent_min_width,
+                    ]
+                    if self.post_quant_conv is not None:
+                        tile = self.post_quant_conv(tile)
+                    tile = self.decoder(tile)
+                    time.append(tile)
+                # The causal-conv caches belong to this spatial tile only; reset them before the next tile.
+                self._clear_fake_context_parallel_cache()
+                row.append(torch.cat(time, dim=2))
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent_width)
+                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
+            result_rows.append(torch.cat(result_row, dim=4))
+        dec = torch.cat(result_rows, dim=3)
         if not return_dict:
             return (dec,)
         return DecoderOutput(sample=dec)
     def forward(

videosys/models/{open_sora/vae.py → autoencoders/autoencoder_kl_open_sora.py} RENAMED Viewed

@@ -18,8 +18,6 @@ from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
 from einops import rearrange
 from transformers import PretrainedConfig, PreTrainedModel
-from .utils import load_checkpoint
 class DiagonalGaussianDistribution(object):
     def __init__(
@@ -474,7 +472,7 @@ class VAE_Temporal(nn.Module):
         return recon_video, posterior, z
-def VAE_Temporal_SD(from_pretrained=None, **kwargs):
     model = VAE_Temporal(
         in_out_channels=4,
         latent_embed_dim=4,
@@ -485,8 +483,6 @@ def VAE_Temporal_SD(from_pretrained=None, **kwargs):
         temporal_downsample=(False, True, True),
         **kwargs,
     )
-    if from_pretrained is not None:
-        load_checkpoint(model, from_pretrained)
     return model
@@ -634,7 +630,7 @@ class VideoAutoencoderPipeline(PreTrainedModel):
             micro_batch_size=4,
             subfolder="vae",
         )
-        self.temporal_vae = VAE_Temporal_SD(from_pretrained=None)
         self.cal_loss = config.cal_loss
         self.micro_frame_size = config.micro_frame_size
         self.micro_z_frame_size = self.temporal_vae.get_latent_size([config.micro_frame_size, None, None])[0]
@@ -763,7 +759,4 @@ def OpenSoraVAE_V1_2(
     else:
         config = VideoAutoencoderPipelineConfig(**kwargs)
         model = VideoAutoencoderPipeline(config)
-        if from_pretrained:
-            load_checkpoint(model, from_pretrained)
     return model

 from einops import rearrange
 from transformers import PretrainedConfig, PreTrainedModel
 class DiagonalGaussianDistribution(object):
     def __init__(
         return recon_video, posterior, z
+def VAE_Temporal_SD(**kwargs):
     model = VAE_Temporal(
         in_out_channels=4,
         latent_embed_dim=4,
         temporal_downsample=(False, True, True),
         **kwargs,
     )
     return model
             micro_batch_size=4,
             subfolder="vae",
         )
+        self.temporal_vae = VAE_Temporal_SD()
         self.cal_loss = config.cal_loss
         self.micro_frame_size = config.micro_frame_size
         self.micro_z_frame_size = self.temporal_vae.get_latent_size([config.micro_frame_size, None, None])[0]
     else:
         config = VideoAutoencoderPipelineConfig(**kwargs)
         model = VideoAutoencoderPipeline(config)
     return model

videosys/models/{open_sora_plan/ae.py → autoencoders/autoencoder_kl_open_sora_plan.py} RENAMED Viewed

@@ -6,20 +6,24 @@
 # References:
 # Open-Sora-Plan: https://github.com/PKU-YuanGroup/Open-Sora-Plan
 # --------------------------------------------------------
 import glob
-import importlib
 import os
 from typing import Optional, Tuple, Union
 import numpy as np
 import torch
 from diffusers import ConfigMixin, ModelMixin
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_utils import ModelMixin
 from einops import rearrange
 from torch import nn
 def Normalize(in_channels, num_groups=32):
     return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
@@ -80,13 +84,7 @@ class DiagonalGaussianDistribution(object):
 def resolve_str_to_obj(str_val, append=True):
-    if append:
-        str_val = "videosys.models.open_sora_plan.modules." + str_val
-    if "opensora.models.ae.videobase." in str_val:
-        str_val = str_val.replace("opensora.models.ae.videobase.", "videosys.models.open_sora_plan.")
-    module_name, class_name = str_val.rsplit(".", 1)
-    module = importlib.import_module(module_name)
-    return getattr(module, class_name)
 class VideoBaseAE_PL(ModelMixin, ConfigMixin):
@@ -130,7 +128,6 @@ class VideoBaseAE_PL(ModelMixin, ConfigMixin):
             model.init_from_ckpt(last_ckpt_file)
             return model
         else:
-            print(f"Loading model from {pretrained_model_name_or_path}")
             return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -431,8 +428,6 @@ class CausalVAEModel(VideoBaseAE_PL):
         self.learning_rate = lr
         self.lr_g_factor = 1.0
-        self.loss = resolve_str_to_obj(loss_type, append=False)(**loss_params)
         self.encoder = Encoder(
             z_channels=z_channels,
             hidden_size=hidden_size,
@@ -471,8 +466,6 @@ class CausalVAEModel(VideoBaseAE_PL):
         quant_conv_cls = resolve_str_to_obj(q_conv)
         self.quant_conv = quant_conv_cls(2 * z_channels, 2 * embed_dim, 1)
         self.post_quant_conv = quant_conv_cls(embed_dim, z_channels, 1)
-        if hasattr(self.loss, "discriminator"):
-            self.automatic_optimization = False
     def encode(self, x):
         if self.use_tiling and (
@@ -855,3 +848,793 @@ def getae_wrapper(ae):
     ae = videobase_ae.get(ae, None)
     assert ae is not None
     return ae

 # References:
 # Open-Sora-Plan: https://github.com/PKU-YuanGroup/Open-Sora-Plan
 # --------------------------------------------------------
 import glob
 import os
 from typing import Optional, Tuple, Union
 import numpy as np
 import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
 from diffusers import ConfigMixin, ModelMixin
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import logging
 from einops import rearrange
 from torch import nn
+logging.set_verbosity_error()
 def Normalize(in_channels, num_groups=32):
+    """Return an affine ``torch.nn.GroupNorm`` over ``in_channels`` channels with ``eps=1e-6``."""
     return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
 def resolve_str_to_obj(str_val, append=True):
+    """Look up ``str_val`` among this module's global names and return the object.
+    ``append`` is accepted for backward compatibility and is ignored.
+    Raises ``KeyError`` naming the missing identifier when no such global exists.
+    """
+    try:
+        return globals()[str_val]
+    except KeyError:
+        raise KeyError(f"'{str_val}' is not defined in {__name__}") from None
 class VideoBaseAE_PL(ModelMixin, ConfigMixin):
             model.init_from_ckpt(last_ckpt_file)
             return model
         else:
             return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
         self.learning_rate = lr
         self.lr_g_factor = 1.0
         self.encoder = Encoder(
             z_channels=z_channels,
             hidden_size=hidden_size,
         quant_conv_cls = resolve_str_to_obj(q_conv)
         self.quant_conv = quant_conv_cls(2 * z_channels, 2 * embed_dim, 1)
         self.post_quant_conv = quant_conv_cls(embed_dim, z_channels, 1)
     def encode(self, x):
         if self.use_tiling and (
     ae = videobase_ae.get(ae, None)
     assert ae is not None
     return ae
+def video_to_image(func):
+    """Decorator letting a per-image ``forward(self, x)`` also accept 5-D video tensors.
+    A 5-D ``x`` is folded with ``"b c t h w -> (b t) c h w"``, passed to ``func`` and
+    unfolded again using ``t = x.shape[2]``. Input of any other rank goes to ``func`` as-is.
+    """
+    def wrapper(self, x, *args, **kwargs):
+        if x.dim() != 5:
+            # Non-5-D input used to be returned untouched, silently skipping the wrapped layer.
+            return func(self, x, *args, **kwargs)
+        t = x.shape[2]
+        x = rearrange(x, "b c t h w -> (b t) c h w")
+        x = func(self, x, *args, **kwargs)
+        return rearrange(x, "(b t) c h w -> b c t h w", t=t)
+    return wrapper
+class Block(nn.Module):
+    """Shared ``nn.Module`` base class for the layers below; it adds no behaviour of its own."""
+    def __init__(self, *args, **kwargs) -> None:
+        super(Block, self).__init__(*args, **kwargs)
+class LinearAttention(Block):
+    """Linear attention over 4-D feature maps, with 1x1 convolutions for the projections."""
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.heads = heads
+        inner_dim = dim_head * heads
+        # a single bias-free 1x1 conv produces q, k and v stacked along the channel axis
+        self.to_qkv = nn.Conv2d(dim, inner_dim * 3, 1, bias=False)
+        self.to_out = nn.Conv2d(inner_dim, dim, 1)
+    def forward(self, x):
+        _, _, height, width = x.shape
+        stacked = rearrange(self.to_qkv(x), "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3)
+        q, k, v = stacked
+        # keys are normalised over the flattened spatial positions
+        k = k.softmax(dim=-1)
+        context = torch.einsum("bhdn,bhen->bhde", k, v)
+        attended = torch.einsum("bhde,bhdn->bhen", context, q)
+        attended = rearrange(attended, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=height, w=width)
+        return self.to_out(attended)
+class LinAttnBlock(LinearAttention):
+    """Single-head ``LinearAttention`` constructed from ``in_channels`` only, like ``AttnBlock``."""
+    def __init__(self, in_channels):
+        # one head whose width equals the channel count
+        super().__init__(in_channels, heads=1, dim_head=in_channels)
+class AttnBlock3D(Block):
+    """Legacy self-attention over 5-D input using ``CausalConv3d`` 1x1 projections.
+    Compatible with old versions, there are issues, use with caution.
+    NOTE(review): q/k/v are reshaped from ``(b, c, t, h, w)`` straight to ``(b * t, c, h * w)``
+    without first moving ``t`` next to ``b`` as ``AttnBlock3DFix`` does; presumably this is the
+    issue meant above. Confirm before relying on this class for new models.
+    """
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+        self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+        self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+        self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1)
+    def forward(self, x):
+        # normalise, then project to queries, keys and values
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        b, c, t, h, w = q.shape
+        q = q.reshape(b * t, c, h * w)
+        q = q.permute(0, 2, 1)  # b,hw,c
+        k = k.reshape(b * t, c, h * w)  # b,c,hw
+        w_ = torch.bmm(q, k)  # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        # scale the scores by 1/sqrt(c) before the softmax over the key axis
+        w_ = w_ * (int(c) ** (-0.5))
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+        # attend to values
+        v = v.reshape(b * t, c, h * w)
+        w_ = w_.permute(0, 2, 1)  # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v, w_)  # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b, c, t, h, w)
+        h_ = self.proj_out(h_)
+        # residual connection around the attention branch
+        return x + h_
+class AttnBlock3DFix(nn.Module):
+    """
+    Per-frame spatial self-attention over 5-D input with a residual connection.
+    Thanks to https://github.com/PKU-YuanGroup/Open-Sora-Plan/pull/172.
+    """
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        proj_kwargs = dict(kernel_size=1, stride=1)
+        self.q = CausalConv3d(in_channels, in_channels, **proj_kwargs)
+        self.k = CausalConv3d(in_channels, in_channels, **proj_kwargs)
+        self.v = CausalConv3d(in_channels, in_channels, **proj_kwargs)
+        self.proj_out = CausalConv3d(in_channels, in_channels, **proj_kwargs)
+    def forward(self, x):
+        normed = self.norm(x)
+        q, k, v = self.q(normed), self.k(normed), self.v(normed)
+        b, c, t, h, w = q.shape
+        def fold_frames(z):
+            # (b c t h w) -> (b t c h w) -> (b*t c h*w)
+            return z.permute(0, 2, 1, 3, 4).reshape(b * t, c, h * w)
+        # queries: (b*t h*w c); keys: (b*t c h*w)
+        q = fold_frames(q).permute(0, 2, 1)
+        k = fold_frames(k)
+        # scores: (b*t hw hw), scaled by 1/sqrt(c) and normalised over the key axis
+        scores = torch.bmm(q, k)
+        scores = scores * (int(c) ** (-0.5))
+        scores = torch.nn.functional.softmax(scores, dim=2)
+        # attend to values: out[b,c,j] = sum_i v[b,c,i] * scores^T[b,i,j]
+        v = fold_frames(v)
+        out = torch.bmm(v, scores.permute(0, 2, 1))
+        # (b*t c hw) -> (b t c h w) -> (b c t h w)
+        out = out.reshape(b, t, c, h, w).permute(0, 2, 1, 3, 4)
+        return x + self.proj_out(out)
+class AttnBlock(Block):
+    """Single-head spatial self-attention with a residual connection, using 1x1 ``Conv2d`` projections."""
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        proj_kwargs = dict(kernel_size=1, stride=1, padding=0)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, **proj_kwargs)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, **proj_kwargs)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, **proj_kwargs)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, **proj_kwargs)
+    @video_to_image
+    def forward(self, x):
+        normed = self.norm(x)
+        q, k, v = self.q(normed), self.k(normed), self.v(normed)
+        b, c, h, w = q.shape
+        n_pos = h * w
+        # queries: (b, hw, c); keys: (b, c, hw)
+        q = q.reshape(b, c, n_pos).permute(0, 2, 1)
+        k = k.reshape(b, c, n_pos)
+        # scores[b,i,j] = sum_c q[b,i,c] k[b,c,j], scaled by 1/sqrt(c)
+        scores = torch.bmm(q, k)
+        scores = scores * (int(c) ** (-0.5))
+        scores = torch.nn.functional.softmax(scores, dim=2)
+        # out[b,c,j] = sum_i v[b,c,i] scores^T[b,i,j]
+        v = v.reshape(b, c, n_pos)
+        out = torch.bmm(v, scores.permute(0, 2, 1))
+        out = self.proj_out(out.reshape(b, c, h, w))
+        return x + out
+class TemporalAttnBlock(Block):
+    """Self-attention along the time axis, run independently for every (batch, h, w) position."""
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        proj_kwargs = dict(kernel_size=1, stride=1, padding=0)
+        self.q = torch.nn.Conv3d(in_channels, in_channels, **proj_kwargs)
+        self.k = torch.nn.Conv3d(in_channels, in_channels, **proj_kwargs)
+        self.v = torch.nn.Conv3d(in_channels, in_channels, **proj_kwargs)
+        self.proj_out = torch.nn.Conv3d(in_channels, in_channels, **proj_kwargs)
+    def forward(self, x):
+        normed = self.norm(x)
+        q, k, v = self.q(normed), self.k(normed), self.v(normed)
+        b, c, t, h, w = q.shape
+        # fold the spatial positions into the batch axis so attention only mixes frames
+        q = rearrange(q, "b c t h w -> (b h w) t c")
+        k = rearrange(k, "b c t h w -> (b h w) c t")
+        v = rearrange(v, "b c t h w -> (b h w) c t")
+        scores = torch.bmm(q, k)
+        scores = scores * (int(c) ** (-0.5))
+        scores = torch.nn.functional.softmax(scores, dim=2)
+        # attend to values
+        out = torch.bmm(v, scores.permute(0, 2, 1))
+        out = rearrange(out, "(b h w) c t -> b c t h w", h=h, w=w)
+        return x + self.proj_out(out)
+def make_attn(in_channels, attn_type="vanilla"):
+    """Build the attention layer selected by ``attn_type`` for ``in_channels`` channels.
+    ``"vanilla"`` gives ``AttnBlock``, ``"vanilla3D"`` gives ``AttnBlock3D``, ``"none"`` gives
+    ``nn.Identity`` and ``"linear"`` gives ``LinAttnBlock``; any other value fails the assertion.
+    """
+    assert attn_type in ["vanilla", "linear", "none", "vanilla3D"], f"attn_type {attn_type} unknown"
+    print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
+    if attn_type == "vanilla":
+        return AttnBlock(in_channels)
+    if attn_type == "vanilla3D":
+        return AttnBlock3D(in_channels)
+    if attn_type == "none":
+        # nn.Identity ignores its constructor arguments, so none are passed
+        return nn.Identity()
+    return LinAttnBlock(in_channels)
+class Conv2d(nn.Conv2d):
+    """``nn.Conv2d`` whose ``forward`` is wrapped by ``video_to_image``."""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int]] = 3,
+        stride: Union[int, Tuple[int]] = 1,
+        padding: Union[str, int, Tuple[int]] = 0,
+        dilation: Union[int, Tuple[int]] = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        # every argument is forwarded unchanged to nn.Conv2d
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+    @video_to_image
+    def forward(self, x):
+        return nn.Conv2d.forward(self, x)
+class CausalConv3d(nn.Module):
+    """3-D convolution with no built-in temporal padding; ``forward`` pads the past instead.
+    ``forward`` prepends ``time_kernel_size - 1`` copies of the first frame before convolving.
+    ``stride`` and ``padding`` are read from ``kwargs``; other keyword arguments are ignored.
+    ``init_method`` is ``"avg"`` (per-channel temporal average), ``"zero"`` (all-zero weight),
+    or anything else to keep ``nn.Conv3d``'s default weight. The bias is always set to zero.
+    """
+    def __init__(
+        self, chan_in, chan_out, kernel_size: Union[int, Tuple[int, int, int]], init_method="random", **kwargs
+    ):
+        super().__init__()
+        self.kernel_size = cast_tuple(kernel_size, 3)
+        self.time_kernel_size = self.kernel_size[0]
+        self.chan_in = chan_in
+        self.chan_out = chan_out
+        stride = cast_tuple(kwargs.pop("stride", 1), 3)
+        padding = list(cast_tuple(kwargs.pop("padding", 0), 3))
+        # the convolution never pads time itself; forward() does it with first-frame copies
+        padding[0] = 0
+        self.conv = nn.Conv3d(chan_in, chan_out, self.kernel_size, stride=stride, padding=padding)
+        self._init_weights(init_method)
+    def _init_weights(self, init_method):
+        """Overwrite the convolution weight according to ``init_method`` and zero the bias."""
+        if init_method == "avg":
+            assert self.kernel_size[1] == 1 and self.kernel_size[2] == 1, "only support temporal up/down sample"
+            assert self.chan_in == self.chan_out, "chan_in must be equal to chan_out"
+            taps = self.time_kernel_size
+            weight = torch.zeros((self.chan_out, self.chan_in, *self.kernel_size))
+            # identity across channels, averaged uniformly over however many temporal taps there are
+            eyes = torch.eye(self.chan_in).unsqueeze(-1).repeat(1, 1, taps) / taps
+            weight[:, :, :, 0, 0] = eyes
+            self.conv.weight = nn.Parameter(
+                weight,
+                requires_grad=True,
+            )
+        elif init_method == "zero":
+            self.conv.weight = nn.Parameter(
+                torch.zeros((self.chan_out, self.chan_in, *self.kernel_size)),
+                requires_grad=True,
+            )
+        if self.conv.bias is not None:
+            nn.init.constant_(self.conv.bias, 0)
+    def forward(self, x):
+        # x is indexed as (b, c, t, h, w); repeat the first frame to fill the missing past
+        first_frame_pad = x[:, :, :1, :, :].repeat((1, 1, self.time_kernel_size - 1, 1, 1))
+        x = torch.cat((first_frame_pad, x), dim=2)
+        return self.conv(x)
+class GroupNorm(Block):
+    """``Block`` wrapper around an affine ``torch.nn.GroupNorm``."""
+    def __init__(self, num_channels, num_groups=32, eps=1e-6, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        # pass the caller's eps through; it used to be dropped in favour of a hard-coded 1e-6
+        self.norm = torch.nn.GroupNorm(num_groups=num_groups, num_channels=num_channels, eps=eps, affine=True)
+    def forward(self, x):
+        return self.norm(x)
+def Normalize(in_channels, num_groups=32):
+    """Return an affine ``torch.nn.GroupNorm`` over ``in_channels`` channels with ``eps=1e-6``."""
+    norm_kwargs = dict(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+    return torch.nn.GroupNorm(**norm_kwargs)
+class ActNorm(nn.Module):
+    """Per-channel affine layer ``scale * (x + loc)`` with data-dependent initialisation.
+    On the first call made in training mode, ``loc`` and ``scale`` are set from the mean and
+    standard deviation of that input; the ``initialized`` buffer records that this happened.
+    """
+    def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False):
+        assert affine
+        super().__init__()
+        self.logdet = logdet
+        param_shape = (1, num_features, 1, 1)
+        self.loc = nn.Parameter(torch.zeros(*param_shape))
+        self.scale = nn.Parameter(torch.ones(*param_shape))
+        self.allow_reverse_init = allow_reverse_init
+        self.register_buffer("initialized", torch.tensor(0, dtype=torch.uint8))
+    def initialize(self, input):
+        """Set ``loc``/``scale`` from the per-channel statistics of a 4-D ``input``."""
+        with torch.no_grad():
+            n_channels = input.shape[1]
+            per_channel = input.permute(1, 0, 2, 3).contiguous().view(n_channels, -1)
+            mean = per_channel.mean(1).view(1, n_channels, 1, 1)
+            std = per_channel.std(1).view(1, n_channels, 1, 1)
+            self.loc.data.copy_(-mean)
+            self.scale.data.copy_(1 / (std + 1e-6))
+    def forward(self, input, reverse=False):
+        if reverse:
+            return self.reverse(input)
+        # 2-D input is temporarily given two trailing singleton axes
+        squeeze = len(input.shape) == 2
+        if squeeze:
+            input = input[:, :, None, None]
+        _, _, height, width = input.shape
+        if self.training and self.initialized.item() == 0:
+            self.initialize(input)
+            self.initialized.fill_(1)
+        h = self.scale * (input + self.loc)
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+        if not self.logdet:
+            return h
+        log_abs = torch.log(torch.abs(self.scale))
+        logdet = height * width * torch.sum(log_abs)
+        logdet = logdet * torch.ones(input.shape[0]).to(input)
+        return h, logdet
+    def reverse(self, output):
+        """Invert ``forward``: return ``output / scale - loc``."""
+        if self.training and self.initialized.item() == 0:
+            if not self.allow_reverse_init:
+                raise RuntimeError(
+                    "Initializing ActNorm in reverse direction is "
+                    "disabled by default. Use allow_reverse_init=True to enable."
+                )
+            self.initialize(output)
+            self.initialized.fill_(1)
+        squeeze = len(output.shape) == 2
+        if squeeze:
+            output = output[:, :, None, None]
+        h = output / self.scale - self.loc
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+        return h
+def nonlinearity(x):
+    """Return ``x`` multiplied element-wise by ``sigmoid(x)``."""
+    gate = torch.sigmoid(x)
+    return x * gate
+def cast_tuple(t, length=1):
+    """Return ``t`` unchanged if it is a tuple, otherwise a tuple holding ``t`` ``length`` times."""
+    if isinstance(t, tuple):
+        return t
+    return (t,) * length
+def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True):
+    """Move axis ``src_dim`` of ``x`` to position ``dest_dim``, keeping the other axes in order.
+    Negative indices count from the end. The result is made contiguous unless
+    ``make_contiguous`` is false.
+    """
+    n_dims = len(x.shape)
+    src_dim = src_dim + n_dims if src_dim < 0 else src_dim
+    dest_dim = dest_dim + n_dims if dest_dim < 0 else dest_dim
+    assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims
+    # take the source axis out of the ordering, then put it back at the destination slot
+    permutation = [d for d in range(n_dims) if d != src_dim]
+    permutation.insert(dest_dim, src_dim)
+    x = x.permute(permutation)
+    return x.contiguous() if make_contiguous else x
+class Codebook(nn.Module):
+    """Vector-quantisation codebook held in buffers and updated by exponential moving average.
+    Buffers: ``embeddings`` (n_codes, embedding_dim), ``N`` (per-code usage average) and
+    ``z_avg`` (per-code running sum of assigned inputs).
+    """
+    def __init__(self, n_codes, embedding_dim):
+        super().__init__()
+        self.register_buffer("embeddings", torch.randn(n_codes, embedding_dim))
+        self.register_buffer("N", torch.zeros(n_codes))
+        self.register_buffer("z_avg", self.embeddings.data.clone())
+        self.n_codes = n_codes
+        self.embedding_dim = embedding_dim
+        self._need_init = True
+    def _tile(self, x):
+        """Repeat the rows of ``x``, adding small Gaussian noise, when there are fewer than ``n_codes``."""
+        d, ew = x.shape
+        if d < self.n_codes:
+            n_repeats = (self.n_codes + d - 1) // d
+            std = 0.01 / np.sqrt(ew)
+            x = x.repeat(n_repeats, 1)
+            x = x + torch.randn_like(x) * std
+        return x
+    def _random_codes(self, flat_inputs):
+        """Draw ``n_codes`` random rows from the tiled inputs, broadcast from rank 0 when distributed."""
+        y = self._tile(flat_inputs)
+        _k_rand = y[torch.randperm(y.shape[0])][: self.n_codes]
+        if dist.is_initialized():
+            dist.broadcast(_k_rand, 0)
+        return _k_rand
+    def _init_embeddings(self, z):
+        """Seed ``embeddings`` and ``z_avg`` with random input vectors and set ``N`` to ones."""
+        # z: [b, c, t, h, w]
+        self._need_init = False
+        flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2)
+        _k_rand = self._random_codes(flat_inputs)
+        self.embeddings.data.copy_(_k_rand)
+        self.z_avg.data.copy_(_k_rand)
+        self.N.data.copy_(torch.ones(self.n_codes))
+    def forward(self, z):
+        """Quantise ``z`` to its nearest codes.
+        Returns a dict with ``embeddings`` (straight-through quantised tensor), ``encodings``
+        (code indices), ``commitment_loss`` and ``perplexity``. In training mode the buffers
+        are also updated in place.
+        """
+        # z: [b, c, t, h, w]
+        if self._need_init and self.training:
+            self._init_embeddings(z)
+        flat_inputs = shift_dim(z, 1, -1).flatten(end_dim=-2)
+        # squared Euclidean distance from every input vector to every code
+        distances = (
+            (flat_inputs**2).sum(dim=1, keepdim=True)
+            - 2 * flat_inputs @ self.embeddings.t()
+            + (self.embeddings.t() ** 2).sum(dim=0, keepdim=True)
+        )
+        encoding_indices = torch.argmin(distances, dim=1)
+        encode_onehot = F.one_hot(encoding_indices, self.n_codes).type_as(flat_inputs)
+        encoding_indices = encoding_indices.view(z.shape[0], *z.shape[2:])
+        embeddings = F.embedding(encoding_indices, self.embeddings)
+        embeddings = shift_dim(embeddings, -1, 1)
+        commitment_loss = 0.25 * F.mse_loss(z, embeddings.detach())
+        # EMA codebook update
+        if self.training:
+            n_total = encode_onehot.sum(dim=0)
+            encode_sum = flat_inputs.t() @ encode_onehot
+            if dist.is_initialized():
+                dist.all_reduce(n_total)
+                dist.all_reduce(encode_sum)
+            self.N.data.mul_(0.99).add_(n_total, alpha=0.01)
+            self.z_avg.data.mul_(0.99).add_(encode_sum.t(), alpha=0.01)
+            n = self.N.sum()
+            weights = (self.N + 1e-7) / (n + self.n_codes * 1e-7) * n
+            encode_normalized = self.z_avg / weights.unsqueeze(1)
+            self.embeddings.data.copy_(encode_normalized)
+            # codes whose usage average fell below 1 are replaced by random input vectors
+            _k_rand = self._random_codes(flat_inputs)
+            usage = (self.N.view(self.n_codes, 1) >= 1).float()
+            self.embeddings.data.mul_(usage).add_(_k_rand * (1 - usage))
+        # straight-through estimator: forward value is the code, gradient flows to z
+        embeddings_st = (embeddings - z).detach() + z
+        avg_probs = torch.mean(encode_onehot, dim=0)
+        perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
+        return dict(
+            embeddings=embeddings_st,
+            encodings=encoding_indices,
+            commitment_loss=commitment_loss,
+            perplexity=perplexity,
+        )
+    def dictionary_lookup(self, encodings):
+        """Return the code vectors for the integer indices in ``encodings``."""
+        embeddings = F.embedding(encodings, self.embeddings)
+        return embeddings
+class ResnetBlock2D(Block):
+    """Residual block of two norm / ``nonlinearity`` / 3x3 ``Conv2d`` stages with dropout before the second conv.
+    ``out_channels`` defaults to ``in_channels``. When the two differ, the skip path goes through
+    a 3x3 conv (``conv_shortcut=True``) or a 1x1 conv (default) to match the channel count.
+    """
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        # resolve the default once; the raw argument may be None and used to reach Conv2d as such
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+    @video_to_image
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        # project the skip path only when the channel counts differ
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        x = x + h
+        return x
+class ResnetBlock3D(Block):
+    """Residual block of two norm / ``nonlinearity`` / ``CausalConv3d`` stages with dropout before the second conv.
+    ``out_channels`` defaults to ``in_channels``. When the two differ, the skip path goes through
+    a kernel-3 (``conv_shortcut=True``) or kernel-1 (default) ``CausalConv3d``.
+    """
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        # resolve the default once; the raw argument may be None and used to reach the convs as such
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = CausalConv3d(in_channels, out_channels, 3, padding=1)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = CausalConv3d(out_channels, out_channels, 3, padding=1)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = CausalConv3d(in_channels, out_channels, 3, padding=1)
+            else:
+                self.nin_shortcut = CausalConv3d(in_channels, out_channels, 1, padding=0)
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        # project the skip path only when the channel counts differ
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x + h
+class Upsample(Block):
+    """2x nearest-neighbour spatial upsampling followed by a 3x3 convolution."""
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        self.with_conv = True
+        if self.with_conv:
+            conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
+            self.conv = torch.nn.Conv2d(in_channels, out_channels, **conv_kwargs)
+    @video_to_image
+    def forward(self, x):
+        upsampled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled) if self.with_conv else upsampled
+class Downsample(Block):
+    """2x spatial downsampling by a stride-2 3x3 convolution over right/bottom zero-padded input."""
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        self.with_conv = True
+        if self.with_conv:
+            # torch convs cannot pad asymmetrically, so forward() pads by hand
+            conv_kwargs = dict(kernel_size=3, stride=2, padding=0)
+            self.conv = torch.nn.Conv2d(in_channels, out_channels, **conv_kwargs)
+    @video_to_image
+    def forward(self, x):
+        if not self.with_conv:
+            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+        # one extra column on the right and one extra row at the bottom
+        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)
+class SpatialDownsample2x(Block):
+    """Spatial downsampling of 5-D input by a strided ``CausalConv3d`` with temporal kernel size 1."""
+    def __init__(
+        self,
+        chan_in,
+        chan_out,
+        kernel_size: Union[int, Tuple[int]] = (3, 3),
+        stride: Union[int, Tuple[int]] = (2, 2),
+    ):
+        super().__init__()
+        spatial_kernel = cast_tuple(kernel_size, 2)
+        spatial_stride = cast_tuple(stride, 2)
+        self.chan_in = chan_in
+        self.chan_out = chan_out
+        self.kernel_size = spatial_kernel
+        # prepend a temporal extent of 1 so only height and width are reduced
+        kernel_3d = (1,) + self.kernel_size
+        stride_3d = (1,) + spatial_stride
+        self.conv = CausalConv3d(self.chan_in, self.chan_out, kernel_3d, stride=stride_3d, padding=0)
+    def forward(self, x):
+        # zero-pad one column on the right and one row at the bottom; time is left alone
+        padded = torch.nn.functional.pad(x, (0, 1, 0, 1, 0, 0), mode="constant", value=0)
+        return self.conv(padded)
+class SpatialUpsample2x(Block):
+    """2x nearest-neighbour spatial upsampling of 5-D input followed by a ``CausalConv3d``.
+    ``kernel_size`` and ``stride`` may each be an int or a 2-tuple, as in ``SpatialDownsample2x``.
+    """
+    def __init__(
+        self,
+        chan_in,
+        chan_out,
+        kernel_size: Union[int, Tuple[int]] = (3, 3),
+        stride: Union[int, Tuple[int]] = (1, 1),
+    ):
+        super().__init__()
+        # normalise ints to 2-tuples; a bare int used to fail on the tuple concatenation below
+        kernel_size = cast_tuple(kernel_size, 2)
+        stride = cast_tuple(stride, 2)
+        self.chan_in = chan_in
+        self.chan_out = chan_out
+        self.kernel_size = kernel_size
+        self.conv = CausalConv3d(self.chan_in, self.chan_out, (1,) + self.kernel_size, stride=(1,) + stride, padding=1)
+    def forward(self, x):
+        t = x.shape[2]
+        # fold time into channels so the 2-D interpolation only touches height and width
+        x = rearrange(x, "b c t h w -> b (c t) h w")
+        x = F.interpolate(x, scale_factor=(2, 2), mode="nearest")
+        x = rearrange(x, "b (c t) h w -> b c t h w", t=t)
+        x = self.conv(x)
+        return x
+class TimeDownsample2x(Block):
+    """Temporal average pooling with stride 2 after prepending copies of the first frame.
+    ``chan_in`` and ``chan_out`` are accepted but not used.
+    """
+    def __init__(self, chan_in, chan_out, kernel_size: int = 3):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.conv = nn.AvgPool3d(kernel_size=(kernel_size, 1, 1), stride=(2, 1, 1))
+    def forward(self, x):
+        n_pad = self.kernel_size - 1
+        lead = x[:, :, :1, :, :].repeat((1, 1, n_pad, 1, 1))
+        return self.conv(torch.cat((lead, x), dim=2))
+class TimeUpsample2x(Block):
+    """Trilinear 2x temporal upsampling of every frame after the first; the first frame is kept as-is.
+    ``chan_in`` and ``chan_out`` are accepted but not used.
+    """
+    def __init__(self, chan_in, chan_out):
+        super().__init__()
+    def forward(self, x):
+        # a single-frame input has nothing to interpolate
+        if x.size(2) <= 1:
+            return x
+        head, tail = x[:, :, :1], x[:, :, 1:]
+        tail = F.interpolate(tail, scale_factor=(2, 1, 1), mode="trilinear")
+        return torch.cat([head, tail], dim=2)
+class TimeDownsampleRes2x(nn.Module):
+    """Stride-2 temporal downsampling that blends average pooling with a ``Conv3d``.
+    The blend weight is ``sigmoid(mix_factor)``, where ``mix_factor`` is a learnable parameter.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size: int = 3,
+        mix_factor: float = 2.0,
+    ):
+        super().__init__()
+        self.kernel_size = cast_tuple(kernel_size, 3)
+        time_stride = (2, 1, 1)
+        self.avg_pool = nn.AvgPool3d((kernel_size, 1, 1), stride=time_stride)
+        self.conv = nn.Conv3d(in_channels, out_channels, self.kernel_size, stride=time_stride, padding=(0, 1, 1))
+        self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
+    def forward(self, x):
+        alpha = torch.sigmoid(self.mix_factor)
+        # prepend copies of the first frame so both branches see the same padded clip
+        n_pad = self.kernel_size[0] - 1
+        lead = x[:, :, :1, :, :].repeat((1, 1, n_pad, 1, 1))
+        padded = torch.cat((lead, x), dim=2)
+        pooled = self.avg_pool(padded)
+        return alpha * pooled + (1 - alpha) * self.conv(padded)
+class TimeUpsampleRes2x(nn.Module):
+    """2x temporal upsampling blended with a ``CausalConv3d`` of the upsampled clip.
+    The blend weight is ``sigmoid(mix_factor)``, where ``mix_factor`` is a learnable parameter.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size: int = 3,
+        mix_factor: float = 2.0,
+    ):
+        super().__init__()
+        self.conv = CausalConv3d(in_channels, out_channels, kernel_size, padding=1)
+        self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
+    def forward(self, x):
+        alpha = torch.sigmoid(self.mix_factor)
+        if x.size(2) > 1:
+            # keep the first frame, interpolate the remaining frames along time
+            head, tail = x[:, :, :1], x[:, :, 1:]
+            tail = F.interpolate(tail, scale_factor=(2, 1, 1), mode="trilinear")
+            x = torch.cat([head, tail], dim=2)
+        refined = self.conv(x)
+        return alpha * x + (1 - alpha) * refined
+class TimeDownsampleResAdv2x(nn.Module):
+    """Stride-2 temporal downsampling that blends average pooling with a res-block / attention / ``Conv3d`` branch.
+    The blend weight is ``sigmoid(mix_factor)``, where ``mix_factor`` is a learnable parameter.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size: int = 3,
+        mix_factor: float = 1.5,
+    ):
+        super().__init__()
+        self.kernel_size = cast_tuple(kernel_size, 3)
+        time_stride = (2, 1, 1)
+        self.avg_pool = nn.AvgPool3d((kernel_size, 1, 1), stride=time_stride)
+        self.attn = TemporalAttnBlock(in_channels)
+        self.res = ResnetBlock3D(in_channels=in_channels, out_channels=in_channels, dropout=0.0)
+        self.conv = nn.Conv3d(in_channels, out_channels, self.kernel_size, stride=time_stride, padding=(0, 1, 1))
+        self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
+    def forward(self, x):
+        # prepend copies of the first frame so both branches see the same padded clip
+        n_pad = self.kernel_size[0] - 1
+        lead = x[:, :, :1, :, :].repeat((1, 1, n_pad, 1, 1))
+        padded = torch.cat((lead, x), dim=2)
+        alpha = torch.sigmoid(self.mix_factor)
+        pooled = self.avg_pool(padded)
+        learned = self.conv(self.attn(self.res(padded)))
+        return alpha * pooled + (1 - alpha) * learned
+class TimeUpsampleResAdv2x(nn.Module):
+    """2x temporal upsampling blended with a res-block / attention / ``CausalConv3d`` branch.
+    The blend weight is ``sigmoid(mix_factor)``, where ``mix_factor`` is a learnable parameter.
+    ``self.norm`` is constructed but not used by ``forward``.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size: int = 3,
+        mix_factor: float = 1.5,
+    ):
+        super().__init__()
+        self.res = ResnetBlock3D(in_channels=in_channels, out_channels=in_channels, dropout=0.0)
+        self.attn = TemporalAttnBlock(in_channels)
+        self.norm = Normalize(in_channels=in_channels)
+        self.conv = CausalConv3d(in_channels, out_channels, kernel_size, padding=1)
+        self.mix_factor = torch.nn.Parameter(torch.Tensor([mix_factor]))
+    def forward(self, x):
+        if x.size(2) > 1:
+            # keep the first frame, interpolate the remaining frames along time
+            head, tail = x[:, :, :1], x[:, :, 1:]
+            tail = F.interpolate(tail, scale_factor=(2, 1, 1), mode="trilinear")
+            x = torch.cat([head, tail], dim=2)
+        alpha = torch.sigmoid(self.mix_factor)
+        learned = self.conv(self.attn(self.res(x)))
+        return alpha * x + (1 - alpha) * learned

videosys/models/cogvideo/__init__.py DELETED Viewed

@@ -1,6 +0,0 @@
-from .pipeline import CogVideoConfig, CogVideoPipeline
-__all__ = [
-    "CogVideoConfig",
-    "CogVideoPipeline",
-]

videosys/models/cogvideo/modules.py DELETED Viewed

@@ -1,317 +0,0 @@
-# Adapted from CogVideo
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# CogVideo: https://github.com/THUDM/CogVideo
-# diffusers: https://github.com/huggingface/diffusers
-# --------------------------------------------------------
-from typing import Optional, Tuple, Union
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from diffusers.models.embeddings import get_1d_sincos_pos_embed_from_grid, get_2d_sincos_pos_embed_from_grid
-class CogVideoXDownsample3D(nn.Module):
-    # Todo: Wait for paper relase.
-    r"""
-    A 3D Downsampling layer using in [CogVideoX]() by Tsinghua University & ZhipuAI
-    Args:
-        in_channels (`int`):
-            Number of channels in the input image.
-        out_channels (`int`):
-            Number of channels produced by the convolution.
-        kernel_size (`int`, defaults to `3`):
-            Size of the convolving kernel.
-        stride (`int`, defaults to `2`):
-            Stride of the convolution.
-        padding (`int`, defaults to `0`):
-            Padding added to all four sides of the input.
-        compress_time (`bool`, defaults to `False`):
-            Whether or not to compress the time dimension.
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int = 3,
-        stride: int = 2,
-        padding: int = 0,
-        compress_time: bool = False,
-    ):
-        super().__init__()
-        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
-        self.compress_time = compress_time
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if self.compress_time:
-            batch_size, channels, frames, height, width = x.shape
-            # (batch_size, channels, frames, height, width) -> (batch_size, height, width, channels, frames) -> (batch_size * height * width, channels, frames)
-            x = x.permute(0, 3, 4, 1, 2).reshape(batch_size * height * width, channels, frames)
-            if x.shape[-1] % 2 == 1:
-                x_first, x_rest = x[..., 0], x[..., 1:]
-                if x_rest.shape[-1] > 0:
-                    # (batch_size * height * width, channels, frames - 1) -> (batch_size * height * width, channels, (frames - 1) // 2)
-                    x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
-                x = torch.cat([x_first[..., None], x_rest], dim=-1)
-                # (batch_size * height * width, channels, (frames // 2) + 1) -> (batch_size, height, width, channels, (frames // 2) + 1) -> (batch_size, channels, (frames // 2) + 1, height, width)
-                x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2)
-            else:
-                # (batch_size * height * width, channels, frames) -> (batch_size * height * width, channels, frames // 2)
-                x = F.avg_pool1d(x, kernel_size=2, stride=2)
-                # (batch_size * height * width, channels, frames // 2) -> (batch_size, height, width, channels, frames // 2) -> (batch_size, channels, frames // 2, height, width)
-                x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2)
-        # Pad the tensor
-        pad = (0, 1, 0, 1)
-        x = F.pad(x, pad, mode="constant", value=0)
-        batch_size, channels, frames, height, width = x.shape
-        # (batch_size, channels, frames, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size * frames, channels, height, width)
-        x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channels, height, width)
-        x = self.conv(x)
-        # (batch_size * frames, channels, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size, channels, frames, height, width)
-        x = x.reshape(batch_size, frames, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
-        return x
-class CogVideoXUpsample3D(nn.Module):
-    r"""
-    A 3D Upsample layer using in CogVideoX by Tsinghua University & ZhipuAI # Todo: Wait for paper relase.
-    Args:
-        in_channels (`int`):
-            Number of channels in the input image.
-        out_channels (`int`):
-            Number of channels produced by the convolution.
-        kernel_size (`int`, defaults to `3`):
-            Size of the convolving kernel.
-        stride (`int`, defaults to `1`):
-            Stride of the convolution.
-        padding (`int`, defaults to `1`):
-            Padding added to all four sides of the input.
-        compress_time (`bool`, defaults to `False`):
-            Whether or not to compress the time dimension.
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int = 3,
-        stride: int = 1,
-        padding: int = 1,
-        compress_time: bool = False,
-    ) -> None:
-        super().__init__()
-        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
-        self.compress_time = compress_time
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        if self.compress_time:
-            if inputs.shape[2] > 1 and inputs.shape[2] % 2 == 1:
-                # split first frame
-                x_first, x_rest = inputs[:, :, 0], inputs[:, :, 1:]
-                x_first = F.interpolate(x_first, scale_factor=2.0)
-                x_rest = F.interpolate(x_rest, scale_factor=2.0)
-                x_first = x_first[:, :, None, :, :]
-                inputs = torch.cat([x_first, x_rest], dim=2)
-            elif inputs.shape[2] > 1:
-                inputs = F.interpolate(inputs, scale_factor=2.0)
-            else:
-                inputs = inputs.squeeze(2)
-                inputs = F.interpolate(inputs, scale_factor=2.0)
-                inputs = inputs[:, :, None, :, :]
-        else:
-            # only interpolate 2D
-            b, c, t, h, w = inputs.shape
-            inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
-            inputs = F.interpolate(inputs, scale_factor=2.0)
-            inputs = inputs.reshape(b, t, c, *inputs.shape[2:]).permute(0, 2, 1, 3, 4)
-        b, c, t, h, w = inputs.shape
-        inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
-        inputs = self.conv(inputs)
-        inputs = inputs.reshape(b, t, *inputs.shape[1:]).permute(0, 2, 1, 3, 4)
-        return inputs
-def get_3d_sincos_pos_embed(
-    embed_dim: int,
-    spatial_size: Union[int, Tuple[int, int]],
-    temporal_size: int,
-    spatial_interpolation_scale: float = 1.0,
-    temporal_interpolation_scale: float = 1.0,
-) -> np.ndarray:
-    r"""
-    Args:
-        embed_dim (`int`):
-        spatial_size (`int` or `Tuple[int, int]`):
-        temporal_size (`int`):
-        spatial_interpolation_scale (`float`, defaults to 1.0):
-        temporal_interpolation_scale (`float`, defaults to 1.0):
-    """
-    if embed_dim % 4 != 0:
-        raise ValueError("`embed_dim` must be divisible by 4")
-    if isinstance(spatial_size, int):
-        spatial_size = (spatial_size, spatial_size)
-    embed_dim_spatial = 3 * embed_dim // 4
-    embed_dim_temporal = embed_dim // 4
-    # 1. Spatial
-    grid_h = np.arange(spatial_size[1], dtype=np.float32) / spatial_interpolation_scale
-    grid_w = np.arange(spatial_size[0], dtype=np.float32) / spatial_interpolation_scale
-    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
-    grid = np.stack(grid, axis=0)
-    grid = grid.reshape([2, 1, spatial_size[1], spatial_size[0]])
-    pos_embed_spatial = get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)
-    # 2. Temporal
-    grid_t = np.arange(temporal_size, dtype=np.float32) / temporal_interpolation_scale
-    pos_embed_temporal = get_1d_sincos_pos_embed_from_grid(embed_dim_temporal, grid_t)
-    # 3. Concat
-    pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
-    pos_embed_spatial = np.repeat(pos_embed_spatial, temporal_size, axis=0)  # [T, H*W, D // 4 * 3]
-    pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
-    pos_embed_temporal = np.repeat(pos_embed_temporal, spatial_size[0] * spatial_size[1], axis=1)  # [T, H*W, D // 4]
-    pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)  # [T, H*W, D]
-    return pos_embed
-class CogVideoXPatchEmbed(nn.Module):
-    def __init__(
-        self,
-        patch_size: int = 2,
-        in_channels: int = 16,
-        embed_dim: int = 1920,
-        text_embed_dim: int = 4096,
-        bias: bool = True,
-    ) -> None:
-        super().__init__()
-        self.patch_size = patch_size
-        self.proj = nn.Conv2d(
-            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
-        )
-        self.text_proj = nn.Linear(text_embed_dim, embed_dim)
-    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
-        r"""
-        Args:
-            text_embeds (`torch.Tensor`):
-                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
-            image_embeds (`torch.Tensor`):
-                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
-        """
-        text_embeds = self.text_proj(text_embeds)
-        batch, num_frames, channels, height, width = image_embeds.shape
-        image_embeds = image_embeds.reshape(-1, channels, height, width)
-        image_embeds = self.proj(image_embeds)
-        image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
-        image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
-        image_embeds = image_embeds.flatten(1, 2)  # [batch, num_frames x height x width, channels]
-        embeds = torch.cat(
-            [text_embeds, image_embeds], dim=1
-        ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]
-        return embeds
-class CogVideoXLayerNormZero(nn.Module):
-    def __init__(
-        self,
-        conditioning_dim: int,
-        embedding_dim: int,
-        elementwise_affine: bool = True,
-        eps: float = 1e-5,
-        bias: bool = True,
-    ) -> None:
-        super().__init__()
-        self.silu = nn.SiLU()
-        self.linear = nn.Linear(conditioning_dim, 6 * embedding_dim, bias=bias)
-        self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
-    def forward(
-        self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        shift, scale, gate, enc_shift, enc_scale, enc_gate = self.linear(self.silu(temb)).chunk(6, dim=1)
-        hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
-        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale)[:, None, :] + enc_shift[:, None, :]
-        return hidden_states, encoder_hidden_states, gate[:, None, :], enc_gate[:, None, :]
-class AdaLayerNorm(nn.Module):
-    r"""
-    Norm layer modified to incorporate timestep embeddings.
-    Parameters:
-        embedding_dim (`int`): The size of each embedding vector.
-        num_embeddings (`int`, *optional*): The size of the embeddings dictionary.
-        output_dim (`int`, *optional*):
-        norm_elementwise_affine (`bool`, defaults to `False):
-        norm_eps (`bool`, defaults to `False`):
-        chunk_dim (`int`, defaults to `0`):
-    """
-    def __init__(
-        self,
-        embedding_dim: int,
-        num_embeddings: Optional[int] = None,
-        output_dim: Optional[int] = None,
-        norm_elementwise_affine: bool = False,
-        norm_eps: float = 1e-5,
-        chunk_dim: int = 0,
-    ):
-        super().__init__()
-        self.chunk_dim = chunk_dim
-        output_dim = output_dim or embedding_dim * 2
-        if num_embeddings is not None:
-            self.emb = nn.Embedding(num_embeddings, embedding_dim)
-        else:
-            self.emb = None
-        self.silu = nn.SiLU()
-        self.linear = nn.Linear(embedding_dim, output_dim)
-        self.norm = nn.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine)
-    def forward(
-        self, x: torch.Tensor, timestep: Optional[torch.Tensor] = None, temb: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
-        if self.emb is not None:
-            temb = self.emb(timestep)
-        temb = self.linear(self.silu(temb))
-        if self.chunk_dim == 1:
-            # This is a bit weird why we have the order of "shift, scale" here and "scale, shift" in the
-            # other if-branch. This branch is specific to CogVideoX for now.
-            shift, scale = temb.chunk(2, dim=1)
-            shift = shift[:, None, :]
-            scale = scale[:, None, :]
-        else:
-            scale, shift = temb.chunk(2, dim=0)
-        x = self.norm(x) * (1 + scale) + shift
-        return x

videosys/models/cogvideo/retrieve_timesteps.py DELETED Viewed

@@ -1,74 +0,0 @@
-# Adapted from CogVideo
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# CogVideo: https://github.com/THUDM/CogVideo
-# diffusers: https://github.com/huggingface/diffusers
-# --------------------------------------------------------
-import inspect
-from typing import List, Optional, Union
-import torch
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    """
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps

videosys/models/latte/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-from .pipeline import LatteConfig, LattePABConfig, LattePipeline
-__all__ = [
-    "LattePipeline",
-    "LattePABConfig",
-    "LatteConfig",
-]

{eval/pab/experiments → videosys/models/modules}/__init__.py RENAMED Viewed

File without changes

videosys/models/modules/activations.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ import torch.nn as nn
2	+
3	+ approx_gelu = lambda: nn.GELU(approximate="tanh")

videosys/{modules/attn.py → models/modules/attentions.py} RENAMED Viewed

@@ -1,12 +1,8 @@
-from dataclasses import dataclass
-from typing import Iterable, List, Optional, Sequence, Tuple
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 import torch.utils.checkpoint
-from videosys.modules.layers import LlamaRMSNorm
 class Attention(nn.Module):
@@ -19,8 +15,9 @@ class Attention(nn.Module):
         attn_drop: float = 0.0,
         proj_drop: float = 0.0,
         norm_layer: nn.Module = LlamaRMSNorm,
-        enable_flashattn: bool = False,
         rope=None,
     ) -> None:
         super().__init__()
         assert dim % num_heads == 0, "dim should be divisible by num_heads"
@@ -28,11 +25,12 @@ class Attention(nn.Module):
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
         self.scale = self.head_dim**-0.5
-        self.enable_flashattn = enable_flashattn
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
         self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
         self.proj = nn.Linear(dim, dim)
         self.proj_drop = nn.Dropout(proj_drop)
@@ -44,18 +42,32 @@ class Attention(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
         qkv = self.qkv(x)
-        qkv = qkv.view(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 1, 3, 4)
         q, k, v = qkv.unbind(0)
-        if self.rope:
-            q = self.rotary_emb(q)
-            k = self.rotary_emb(k)
-        q, k = self.q_norm(q), self.k_norm(k)
-        if self.enable_flashattn:
             from flash_attn import flash_attn_func
             x = flash_attn_func(
                 q,
                 k,
@@ -64,13 +76,17 @@ class Attention(nn.Module):
                 softmax_scale=self.scale,
             )
         else:
-            q, k, v = map(lambda t: t.permute(0, 2, 1, 3), (q, k, v))
-            x = F.scaled_dot_product_attention(
-                q, k, v, scale=self.scale, dropout_p=self.attn_drop.p if self.training else 0.0
-            )
         x_output_shape = (B, N, C)
-        if not self.enable_flashattn:
             x = x.transpose(1, 2)
         x = x.reshape(x_output_shape)
         x = self.proj(x)
@@ -79,139 +95,37 @@ class Attention(nn.Module):
 class MultiHeadCrossAttention(nn.Module):
-    def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0, enable_flashattn=False):
         super(MultiHeadCrossAttention, self).__init__()
         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
         self.d_model = d_model
         self.num_heads = num_heads
         self.head_dim = d_model // num_heads
-        self.enable_flashattn = enable_flashattn
         self.q_linear = nn.Linear(d_model, d_model)
         self.kv_linear = nn.Linear(d_model, d_model * 2)
         self.attn_drop = nn.Dropout(attn_drop)
         self.proj = nn.Linear(d_model, d_model)
         self.proj_drop = nn.Dropout(proj_drop)
-        self.last_out = None
-        self.count = 0
-    def forward(self, x, cond, mask=None, timestep=None):
         # query/value: img tokens; key: condition; mask: if padding tokens
         B, N, C = x.shape
         q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
         kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
         k, v = kv.unbind(2)
-        x = self.flash_attn_impl(q, k, v, mask, B, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-    def flash_attn_impl(self, q, k, v, mask, B, N, C):
-        from flash_attn import flash_attn_varlen_func
-        q_seqinfo = _SeqLenInfo.from_seqlens([N] * B)
-        k_seqinfo = _SeqLenInfo.from_seqlens(mask)
-        x = flash_attn_varlen_func(
-            q.view(-1, self.num_heads, self.head_dim),
-            k.view(-1, self.num_heads, self.head_dim),
-            v.view(-1, self.num_heads, self.head_dim),
-            cu_seqlens_q=q_seqinfo.seqstart.cuda(),
-            cu_seqlens_k=k_seqinfo.seqstart.cuda(),
-            max_seqlen_q=q_seqinfo.max_seqlen,
-            max_seqlen_k=k_seqinfo.max_seqlen,
-            dropout_p=self.attn_drop.p if self.training else 0.0,
-        )
-        x = x.view(B, N, C)
-        return x
-    def torch_impl(self, q, k, v, mask, B, N, C):
-        q = q.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
-        k = k.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
-        v = v.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
-        attn_mask = torch.zeros(B, N, k.shape[2], dtype=torch.float32, device=q.device)
-        for i, m in enumerate(mask):
-            attn_mask[i, :, m:] = -1e8
-        scale = 1 / q.shape[-1] ** 0.5
-        q = q * scale
-        attn = q @ k.transpose(-2, -1)
-        attn = attn.to(torch.float32)
         if mask is not None:
-            attn = attn + attn_mask.unsqueeze(1)
-        attn = attn.softmax(-1)
-        attn = attn.to(v.dtype)
-        out = attn @ v
-        x = out.transpose(1, 2).contiguous().view(B, N, C)
         return x
-@dataclass
-class _SeqLenInfo:
-    """
-    copied from xformers
-    (Internal) Represents the division of a dimension into blocks.
-    For example, to represents a dimension of length 7 divided into
-    three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`.
-    The members will be:
-        max_seqlen: 3
-        min_seqlen: 2
-        seqstart_py: [0, 2, 5, 7]
-        seqstart: torch.IntTensor([0, 2, 5, 7])
-    """
-    seqstart: torch.Tensor
-    max_seqlen: int
-    min_seqlen: int
-    seqstart_py: List[int]
-    def to(self, device: torch.device) -> None:
-        self.seqstart = self.seqstart.to(device, non_blocking=True)
-    def intervals(self) -> Iterable[Tuple[int, int]]:
-        yield from zip(self.seqstart_py, self.seqstart_py[1:])
-    @classmethod
-    def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo":
-        """
-        Input tensors are assumed to be in shape [B, M, *]
-        """
-        assert not isinstance(seqlens, torch.Tensor)
-        seqstart_py = [0]
-        max_seqlen = -1
-        min_seqlen = -1
-        for seqlen in seqlens:
-            min_seqlen = min(min_seqlen, seqlen) if min_seqlen != -1 else seqlen
-            max_seqlen = max(max_seqlen, seqlen)
-            seqstart_py.append(seqstart_py[len(seqstart_py) - 1] + seqlen)
-        seqstart = torch.tensor(seqstart_py, dtype=torch.int32)
-        return cls(
-            max_seqlen=max_seqlen,
-            min_seqlen=min_seqlen,
-            seqstart=seqstart,
-            seqstart_py=seqstart_py,
-        )
-    def split(self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None) -> List[torch.Tensor]:
-        if self.seqstart_py[-1] != x.shape[1] or x.shape[0] != 1:
-            raise ValueError(
-                f"Invalid `torch.Tensor` of shape {x.shape}, expected format "
-                f"(B, M, *) with B=1 and M={self.seqstart_py[-1]}\n"
-                f" seqstart: {self.seqstart_py}"
-            )
-        if batch_sizes is None:
-            batch_sizes = [1] * (len(self.seqstart_py) - 1)
-        split_chunks = []
-        it = 0
-        for batch_size in batch_sizes:
-            split_chunks.append(self.seqstart_py[it + batch_size] - self.seqstart_py[it])
-            it += batch_size
-        return [
-            tensor.reshape([bs, -1, *tensor.shape[2:]]) for bs, tensor in zip(batch_sizes, x.split(split_chunks, dim=1))
-        ]

 import torch
 import torch.nn as nn
 import torch.utils.checkpoint
+from videosys.models.modules.normalization import LlamaRMSNorm
 class Attention(nn.Module):
         attn_drop: float = 0.0,
         proj_drop: float = 0.0,
         norm_layer: nn.Module = LlamaRMSNorm,
+        enable_flash_attn: bool = False,
         rope=None,
+        qk_norm_legacy: bool = False,
     ) -> None:
         super().__init__()
         assert dim % num_heads == 0, "dim should be divisible by num_heads"
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
         self.scale = self.head_dim**-0.5
+        self.enable_flash_attn = enable_flash_attn
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
         self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.qk_norm_legacy = qk_norm_legacy
         self.attn_drop = nn.Dropout(attn_drop)
         self.proj = nn.Linear(dim, dim)
         self.proj_drop = nn.Dropout(proj_drop)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
+        # flash attn is not memory efficient for small sequences, this is empirical
+        enable_flash_attn = self.enable_flash_attn and (N > B)
         qkv = self.qkv(x)
+        qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
+        qkv = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4)
         q, k, v = qkv.unbind(0)
+        if self.qk_norm_legacy:
+            # WARNING: this may be a bug
+            if self.rope:
+                q = self.rotary_emb(q)
+                k = self.rotary_emb(k)
+            q, k = self.q_norm(q), self.k_norm(k)
+        else:
+            q, k = self.q_norm(q), self.k_norm(k)
+            if self.rope:
+                q = self.rotary_emb(q)
+                k = self.rotary_emb(k)
+        if enable_flash_attn:
             from flash_attn import flash_attn_func
+            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
+            q = q.permute(0, 2, 1, 3)
+            k = k.permute(0, 2, 1, 3)
+            v = v.permute(0, 2, 1, 3)
             x = flash_attn_func(
                 q,
                 k,
                 softmax_scale=self.scale,
             )
         else:
+            dtype = q.dtype
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)  # translate attn to float32
+            attn = attn.to(torch.float32)
+            attn = attn.softmax(dim=-1)
+            attn = attn.to(dtype)  # cast back attn to original dtype
+            attn = self.attn_drop(attn)
+            x = attn @ v
         x_output_shape = (B, N, C)
+        if not enable_flash_attn:
             x = x.transpose(1, 2)
         x = x.reshape(x_output_shape)
         x = self.proj(x)
 class MultiHeadCrossAttention(nn.Module):
+    # Cross-attention with queries projected from `x` and keys/values
+    # projected from `cond`, computed by xformers' memory-efficient kernel.
+    #
+    # d_model:   channel width of both `x` and `cond` (input size of both
+    #            linear projections); must be divisible by num_heads.
+    # num_heads: number of attention heads (head_dim = d_model // num_heads).
+    # attn_drop: attention dropout probability; only applied in training mode.
+    # proj_drop: dropout probability applied after the output projection.
+    def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0):
         super(MultiHeadCrossAttention, self).__init__()
         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
         self.d_model = d_model
         self.num_heads = num_heads
         self.head_dim = d_model // num_heads
         self.q_linear = nn.Linear(d_model, d_model)
         self.kv_linear = nn.Linear(d_model, d_model * 2)
         self.attn_drop = nn.Dropout(attn_drop)
         self.proj = nn.Linear(d_model, d_model)
         self.proj_drop = nn.Dropout(proj_drop)
+    # x:    tensor unpacked as (B, N, C).
+    # cond: conditioning tensor whose last dim is d_model; it is flattened
+    #       into a single packed sequence before attention.
+    # mask: optional key/value sequence lengths handed to
+    #       BlockDiagonalMask.from_seqlens (presumably one valid length per
+    #       sample - confirm against the caller).
+    # Returns a tensor viewed as (B, -1, C) after projection and dropout.
+    def forward(self, x, cond, mask=None):
         # query/value: img tokens; key: condition; mask: if padding tokens
         B, N, C = x.shape
         q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
         kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
         k, v = kv.unbind(2)
+        attn_bias = None
+        # TODO: support torch computation
+        import xformers.ops
+        # NOTE(review): q/k/v are packed into a batch of size 1, so without a
+        # block-diagonal mask every query attends to every sample's keys.
+        # Confirm callers always pass `mask` when B > 1.
         if mask is not None:
+            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
+        # The functional xformers API applies dropout whenever p > 0, so the
+        # probability must be zeroed explicitly outside training mode.
+        dropout_p = self.attn_drop.p if self.training else 0.0
+        x = xformers.ops.memory_efficient_attention(q, k, v, p=dropout_p, attn_bias=attn_bias)
+        x = x.view(B, -1, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
         return x

videosys/models/modules/downsampling.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class CogVideoXDownsample3D(nn.Module):
+    # TODO: add the paper reference once it is released.
+    r"""
+    Downsampling layer used in CogVideoX (Tsinghua University & ZhipuAI).
+
+    Each frame is zero-padded by one pixel on the right and bottom and passed
+    through a 2D convolution. With `compress_time` enabled, the frame axis is
+    first average-pooled by a factor of 2 (the first frame is kept unpooled
+    when the frame count is odd).
+
+    Args:
+        in_channels (`int`):
+            Number of channels in the input.
+        out_channels (`int`):
+            Number of channels produced by the convolution.
+        kernel_size (`int`, defaults to `3`):
+            Kernel size of the 2D convolution.
+        stride (`int`, defaults to `2`):
+            Stride of the 2D convolution.
+        padding (`int`, defaults to `0`):
+            Padding argument of the 2D convolution.
+        compress_time (`bool`, defaults to `False`):
+            Whether to also downsample the frame axis.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 2,
+        padding: int = 0,
+        compress_time: bool = False,
+    ):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.compress_time:
+            b, c, t, h, w = x.shape
+            # Fold the spatial positions into the batch so the frame axis is last:
+            # (b, c, t, h, w) -> (b * h * w, c, t)
+            seq = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, t)
+            if t % 2 == 1:
+                # Odd frame count: keep the first frame, pool the rest in pairs.
+                first = seq[..., :1]
+                rest = seq[..., 1:]
+                if rest.shape[-1] > 0:
+                    rest = F.avg_pool1d(rest, kernel_size=2, stride=2)
+                seq = torch.cat([first, rest], dim=-1)
+            else:
+                seq = F.avg_pool1d(seq, kernel_size=2, stride=2)
+            # (b * h * w, c, t') -> (b, c, t', h, w)
+            x = seq.reshape(b, h, w, c, seq.shape[-1]).permute(0, 3, 4, 1, 2)
+
+        # Zero-pad one pixel on the right (width) and bottom (height).
+        x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        b, c, t, h, w = x.shape
+        # Run the 2D convolution frame by frame: (b, c, t, h, w) -> (b * t, c, h, w)
+        frames = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+        frames = self.conv(frames)
+        # (b * t, c', h', w') -> (b, c', t, h', w')
+        return frames.reshape(b, t, *frames.shape[1:]).permute(0, 2, 1, 3, 4)

videosys/models/{open_sora/modules.py → modules/embeddings.py} RENAMED Viewed

@@ -1,16 +1,8 @@
-# Adapted from OpenSora
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# OpenSora: https://github.com/hpcaitech/Open-Sora
-# --------------------------------------------------------
 import functools
 import math
-from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -18,40 +10,48 @@ import torch.utils.checkpoint
 from einops import rearrange
 from timm.models.vision_transformer import Mlp
-approx_gelu = lambda: nn.GELU(approximate="tanh")
-class LlamaRMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        LlamaRMSNorm is equivalent to T5LayerNorm
-        """
         super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-def get_layernorm(hidden_size: torch.Tensor, eps: float, affine: bool):
-    return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine)
-def t2i_modulate(x, shift, scale):
-    return x * (1 + scale) + shift
-# ===============================================
-# General-purpose Layers
-# ===============================================
-class PatchEmbed3D(nn.Module):
     """Video to Patch Embedding.
     Args:
@@ -104,176 +104,6 @@ class PatchEmbed3D(nn.Module):
         return x
-class Attention(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_heads: int = 8,
-        qkv_bias: bool = False,
-        qk_norm: bool = False,
-        attn_drop: float = 0.0,
-        proj_drop: float = 0.0,
-        norm_layer: nn.Module = LlamaRMSNorm,
-        enable_flash_attn: bool = False,
-        rope=None,
-        qk_norm_legacy: bool = False,
-    ) -> None:
-        super().__init__()
-        assert dim % num_heads == 0, "dim should be divisible by num_heads"
-        self.dim = dim
-        self.num_heads = num_heads
-        self.head_dim = dim // num_heads
-        self.scale = self.head_dim**-0.5
-        self.enable_flash_attn = enable_flash_attn
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
-        self.qk_norm_legacy = qk_norm_legacy
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-        self.rope = False
-        if rope is not None:
-            self.rope = True
-            self.rotary_emb = rope
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        B, N, C = x.shape
-        # flash attn is not memory efficient for small sequences, this is empirical
-        enable_flash_attn = self.enable_flash_attn and (N > B)
-        qkv = self.qkv(x)
-        qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
-        qkv = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv.unbind(0)
-        if self.qk_norm_legacy:
-            # WARNING: this may be a bug
-            if self.rope:
-                q = self.rotary_emb(q)
-                k = self.rotary_emb(k)
-            q, k = self.q_norm(q), self.k_norm(k)
-        else:
-            q, k = self.q_norm(q), self.k_norm(k)
-            if self.rope:
-                q = self.rotary_emb(q)
-                k = self.rotary_emb(k)
-        if enable_flash_attn:
-            from flash_attn import flash_attn_func
-            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
-            q = q.permute(0, 2, 1, 3)
-            k = k.permute(0, 2, 1, 3)
-            v = v.permute(0, 2, 1, 3)
-            x = flash_attn_func(
-                q,
-                k,
-                v,
-                dropout_p=self.attn_drop.p if self.training else 0.0,
-                softmax_scale=self.scale,
-            )
-        else:
-            dtype = q.dtype
-            q = q * self.scale
-            attn = q @ k.transpose(-2, -1)  # translate attn to float32
-            attn = attn.to(torch.float32)
-            attn = attn.softmax(dim=-1)
-            attn = attn.to(dtype)  # cast back attn to original dtype
-            attn = self.attn_drop(attn)
-            x = attn @ v
-        x_output_shape = (B, N, C)
-        if not enable_flash_attn:
-            x = x.transpose(1, 2)
-        x = x.reshape(x_output_shape)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-class MultiHeadCrossAttention(nn.Module):
-    def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0):
-        super(MultiHeadCrossAttention, self).__init__()
-        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.head_dim = d_model // num_heads
-        self.q_linear = nn.Linear(d_model, d_model)
-        self.kv_linear = nn.Linear(d_model, d_model * 2)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(d_model, d_model)
-        self.proj_drop = nn.Dropout(proj_drop)
-    def forward(self, x, cond, mask=None):
-        # query/value: img tokens; key: condition; mask: if padding tokens
-        B, N, C = x.shape
-        q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
-        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
-        k, v = kv.unbind(2)
-        attn_bias = None
-        # TODO: support torch computation
-        import xformers.ops
-        if mask is not None:
-            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
-        x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
-        x = x.view(B, -1, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-class T2IFinalLayer(nn.Module):
-    """
-    The final layer of PixArt.
-    """
-    def __init__(self, hidden_size, num_patch, out_channels, d_t=None, d_s=None):
-        super().__init__()
-        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
-        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5)
-        self.out_channels = out_channels
-        self.d_t = d_t
-        self.d_s = d_s
-    def t_mask_select(self, x_mask, x, masked_x, T, S):
-        # x: [B, (T, S), C]
-        # mased_x: [B, (T, S), C]
-        # x_mask: [B, T]
-        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
-        masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
-        x = torch.where(x_mask[:, :, None, None], x, masked_x)
-        x = rearrange(x, "B T S C -> B (T S) C")
-        return x
-    def forward(self, x, t, x_mask=None, t0=None, T=None, S=None):
-        if T is None:
-            T = self.d_t
-        if S is None:
-            S = self.d_s
-        shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
-        x = t2i_modulate(self.norm_final(x), shift, scale)
-        if x_mask is not None:
-            shift_zero, scale_zero = (self.scale_shift_table[None] + t0[:, None]).chunk(2, dim=1)
-            x_zero = t2i_modulate(self.norm_final(x), shift_zero, scale_zero)
-            x = self.t_mask_select(x_mask, x, x_zero, T, S)
-        x = self.linear(x)
-        return x
-# ===============================================
-# Embedding Layers for Timesteps and Class Labels
-# ===============================================
 class TimestepEmbedder(nn.Module):
     """
     Embeds scalar timesteps into vector representations.
@@ -350,7 +180,7 @@ class SizeEmbedder(TimestepEmbedder):
         return next(self.parameters()).dtype
-class CaptionEmbedder(nn.Module):
     """
     Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
     """
@@ -398,7 +228,7 @@ class CaptionEmbedder(nn.Module):
         return caption
-class PositionEmbedding2D(nn.Module):
     def __init__(self, dim: int) -> None:
         super().__init__()
         self.dim = dim
@@ -448,3 +278,135 @@ class PositionEmbedding2D(nn.Module):
         base_size: Optional[int] = None,
     ) -> torch.Tensor:
         return self._get_cached_emb(x.device, x.dtype, h, w, scale, base_size)

 import functools
 import math
+from typing import Optional, Tuple, Union
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
 from timm.models.vision_transformer import Mlp
+class CogVideoXPatchEmbed(nn.Module):
+    """
+    Projects text embeddings and per-frame image embeddings to `embed_dim` and joins them into one sequence.
+    Frames are patchified with a 2D convolution whose kernel size and stride both equal `patch_size`; text
+    embeddings go through a linear layer. Text tokens are placed before the image tokens in the output.
+    """
+    def __init__(
+        self,
+        patch_size: int = 2,
+        in_channels: int = 16,
+        embed_dim: int = 1920,
+        text_embed_dim: int = 4096,
+        bias: bool = True,
+    ) -> None:
         super().__init__()
+        self.patch_size = patch_size
+        # Non-overlapping patches: kernel size and stride are both `patch_size`.
+        self.proj = nn.Conv2d(
+            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
+        )
+        self.text_proj = nn.Linear(text_embed_dim, embed_dim)
+    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor) -> torch.Tensor:
+        r"""
+        Args:
+            text_embeds (`torch.Tensor`):
+                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
+            image_embeds (`torch.Tensor`):
+                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
+        Returns:
+            `torch.Tensor`: Text tokens followed by image patch tokens, concatenated along dim 1. Shape:
+            (batch_size, seq_length + num_frames * patches_per_frame, embed_dim), where `patches_per_frame` is the
+            number of spatial positions produced by the patch convolution.
+        """
+        text_embeds = self.text_proj(text_embeds)
+        batch, num_frames, channels, height, width = image_embeds.shape
+        # Fold frames into the batch axis so the 2D convolution sees one frame per sample.
+        image_embeds = image_embeds.reshape(-1, channels, height, width)
+        image_embeds = self.proj(image_embeds)
+        # Restore the frame axis; the trailing dims are now (embed_dim, patched height, patched width).
+        image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
+        image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
+        image_embeds = image_embeds.flatten(1, 2)  # [batch, num_frames x height x width, channels]
+        embeds = torch.cat(
+            [text_embeds, image_embeds], dim=1
+        ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]
+        return embeds
+class OpenSoraPatchEmbed3D(nn.Module):
     """Video to Patch Embedding.
     Args:
         return x
 class TimestepEmbedder(nn.Module):
     """
     Embeds scalar timesteps into vector representations.
         return next(self.parameters()).dtype
+class OpenSoraCaptionEmbedder(nn.Module):
     """
     Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
     """
         return caption
+class OpenSoraPositionEmbedding2D(nn.Module):
     def __init__(self, dim: int) -> None:
         super().__init__()
         self.dim = dim
         base_size: Optional[int] = None,
     ) -> torch.Tensor:
         return self._get_cached_emb(x.device, x.dtype, h, w, scale, base_size)
+def get_3d_rotary_pos_embed(
+    embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    """
+    RoPE for video tokens with 3D structure.
+    Args:
+        embed_dim (`int`):
+            The embedding dimension size, corresponding to hidden_size_head. It is split into a temporal part
+            (`embed_dim // 4`) and height/width parts (`embed_dim // 8 * 3` each).
+        crops_coords (`Tuple[Tuple[int]]`):
+            The top-left and bottom-right coordinates of the crop, unpacked as `(start, stop)`; index 0 of each is
+            used for height and index 1 for width.
+        grid_size (`Tuple[int]`):
+            The grid size of the spatial positional embedding (height, width).
+        temporal_size (`int`):
+            The size of the temporal dimension.
+        theta (`int`, defaults to `10000`):
+            Base of the geometric progression used for frequency computation.
+        use_real (`bool`, defaults to `True`):
+            If True, return the cosine and sine parts separately. Otherwise, return complex numbers.
+    Returns:
+        If `use_real` is True, a tuple `(cos, sin)` of `torch.Tensor`; otherwise a single complex `torch.Tensor`.
+        Each tensor has shape `(temporal_size * grid_size[0] * grid_size[1], d)`, where `d` is the summed width of the
+        temporal, height and width frequency blocks (`dim_t + dim_h + dim_w` when each of them is even).
+    """
+    start, stop = crops_coords
+    # Positions along each axis; `endpoint=False` excludes the stop coordinate.
+    grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
+    grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
+    grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)
+    # Compute dimensions for each axis
+    dim_t = embed_dim // 4
+    dim_h = embed_dim // 8 * 3
+    dim_w = embed_dim // 8 * 3
+    # Temporal frequencies
+    freqs_t = 1.0 / (theta ** (torch.arange(0, dim_t, 2).float() / dim_t))
+    grid_t = torch.from_numpy(grid_t).float()
+    # Outer product position x frequency -> (temporal_size, number of frequencies)
+    freqs_t = torch.einsum("n , f -> n f", grid_t, freqs_t)
+    # Duplicate every frequency so consecutive channel pairs share one rotation angle.
+    freqs_t = freqs_t.repeat_interleave(2, dim=-1)
+    # Spatial frequencies for height and width
+    freqs_h = 1.0 / (theta ** (torch.arange(0, dim_h, 2).float() / dim_h))
+    freqs_w = 1.0 / (theta ** (torch.arange(0, dim_w, 2).float() / dim_w))
+    grid_h = torch.from_numpy(grid_h).float()
+    grid_w = torch.from_numpy(grid_w).float()
+    freqs_h = torch.einsum("n , f -> n f", grid_h, freqs_h)
+    freqs_w = torch.einsum("n , f -> n f", grid_w, freqs_w)
+    freqs_h = freqs_h.repeat_interleave(2, dim=-1)
+    freqs_w = freqs_w.repeat_interleave(2, dim=-1)
+    # Broadcast and concatenate tensors along specified dimension
+    def broadcast(tensors, dim=-1):
+        # Expands every tensor to the per-axis maximum size on all axes except `dim`, then concatenates along `dim`.
+        num_tensors = len(tensors)
+        shape_lens = {len(t.shape) for t in tensors}
+        assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
+        shape_len = list(shape_lens)[0]
+        # Normalise a negative concatenation axis to a non-negative index.
+        dim = (dim + shape_len) if dim < 0 else dim
+        # dims[i] holds the size of axis i for every tensor.
+        dims = list(zip(*(list(t.shape) for t in tensors)))
+        expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
+        # At most two distinct sizes are accepted per non-concatenation axis.
+        assert all(
+            [*(len(set(t[1])) <= 2 for t in expandable_dims)]
+        ), "invalid dimensions for broadcastable concatenation"
+        max_dims = [(t[0], max(t[1])) for t in expandable_dims]
+        expanded_dims = [(t[0], (t[1],) * num_tensors) for t in max_dims]
+        # The concatenation axis keeps each tensor's own size.
+        expanded_dims.insert(dim, (dim, dims[dim]))
+        # Transpose back into one target shape per tensor.
+        expandable_shapes = list(zip(*(t[1] for t in expanded_dims)))
+        tensors = [t[0].expand(*t[1]) for t in zip(tensors, expandable_shapes)]
+        return torch.cat(tensors, dim=dim)
+    # Place each axis' frequencies on its own grid axis, then join them along the channel axis.
+    freqs = broadcast((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1)
+    t, h, w, d = freqs.shape
+    # Flatten the (t, h, w) grid into a single sequence axis.
+    freqs = freqs.view(t * h * w, d)
+    # Generate sine and cosine components
+    sin = freqs.sin()
+    cos = freqs.cos()
+    if use_real:
+        return cos, sin
+    else:
+        # Unit-magnitude complex numbers whose angle is the rotation frequency.
+        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+        return freqs_cis
+def apply_rotary_emb(
+    x: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+    use_real: bool = True,
+    use_real_unbind_dim: int = -1,
+) -> torch.Tensor:
+    """
+    Apply rotary embeddings to a query or key tensor `x` using the precomputed frequencies `freqs_cis`. With
+    `use_real=True` the rotation is computed from separate cosine and sine tensors; otherwise `x` is viewed as
+    complex numbers and multiplied by complex frequencies. The result is returned as a real tensor with the dtype
+    of `x`.
+    Args:
+        x (`torch.Tensor`):
+            Query or key tensor to apply rotary embeddings to. Expected shape: [B, H, S, D].
+        freqs_cis (`Tuple[torch.Tensor, torch.Tensor]` or `torch.Tensor`):
+            Precomputed frequencies. A `(cos, sin)` tuple of [S, D] tensors when `use_real` is True, otherwise a
+            single complex tensor.
+        use_real (`bool`, defaults to `True`):
+            Selects the real-valued (cos/sin) path instead of the complex path.
+        use_real_unbind_dim (`int`, defaults to `-1`):
+            Only used when `use_real` is True. `-1` treats consecutive channel pairs as (real, imag); `-2` treats
+            the first and second halves of the channel axis as (real, imag).
+    Returns:
+        `torch.Tensor`: The input tensor with rotary embeddings applied, same dtype as `x`.
+    Raises:
+        ValueError: If `use_real` is True and `use_real_unbind_dim` is neither -1 nor -2.
+    """
+    if use_real:
+        cos, sin = freqs_cis  # [S, D]
+        # Add two leading singleton axes so cos/sin broadcast against a 4D `x`.
+        cos = cos[None, None]
+        sin = sin[None, None]
+        cos, sin = cos.to(x.device), sin.to(x.device)
+        if use_real_unbind_dim == -1:
+            # Use for example in Lumina
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, H, S, D//2] for the documented layout
+            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+        elif use_real_unbind_dim == -2:
+            # Use for example in Stable Audio
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, H, S, D//2] for the documented layout
+            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+        else:
+            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
+        # Rotation is computed in float32 and cast back to the input dtype.
+        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+        return out
+    else:
+        # View consecutive channel pairs as complex numbers.
+        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        # NOTE(review): unsqueeze(2) suggests this path expects a different axis layout for `x` than the
+        # [B, H, S, D] used above -- confirm against callers.
+        freqs_cis = freqs_cis.unsqueeze(2)
+        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+        return x_out.type_as(x)

videosys/models/modules/normalization.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+class LlamaRMSNorm(nn.Module):
+    """
+    Root-mean-square normalization over the last dimension with a learned per-channel scale and no bias.
+    """
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        LlamaRMSNorm is equivalent to T5LayerNorm
+        Args:
+            hidden_size: Size of the last dimension; one scale weight is learned per channel.
+            eps: Constant added to the mean square before taking the reciprocal square root.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        """Normalize `hidden_states` by its root mean square over the last dimension and apply the learned scale."""
+        input_dtype = hidden_states.dtype
+        # Statistics are computed in float32 regardless of the input dtype.
+        hidden_states = hidden_states.to(torch.float32)
+        # Mean of squares; the mean itself is not subtracted.
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast back to the input dtype before scaling.
+        return self.weight * hidden_states.to(input_dtype)
+class CogVideoXLayerNormZero(nn.Module):
+    """
+    LayerNorm whose shift and scale are predicted from a conditioning embedding.
+    One linear layer maps the conditioning embedding to six chunks: shift, scale and gate for the hidden states and
+    for the encoder hidden states. Both streams share the same `nn.LayerNorm`. The gates are returned unapplied.
+    """
+    def __init__(
+        self,
+        conditioning_dim: int,
+        embedding_dim: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        self.silu = nn.SiLU()
+        # 6 = (shift, scale, gate) for hidden states + (shift, scale, gate) for encoder hidden states.
+        self.linear = nn.Linear(conditioning_dim, 6 * embedding_dim, bias=bias)
+        self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
+    def forward(
+        self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            A 4-tuple: modulated hidden states, modulated encoder hidden states, the gate for the hidden states and
+            the gate for the encoder hidden states. Both gates have a singleton axis inserted at dim 1.
+        """
+        # `temb` is chunked along dim 1, so it is expected to be 2D with `conditioning_dim` features.
+        shift, scale, gate, enc_shift, enc_scale, enc_gate = self.linear(self.silu(temb)).chunk(6, dim=1)
+        # `[:, None, :]` inserts an axis at dim 1 so the per-sample modulation broadcasts over it.
+        hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
+        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale)[:, None, :] + enc_shift[:, None, :]
+        return hidden_states, encoder_hidden_states, gate[:, None, :], enc_gate[:, None, :]
+class AdaLayerNorm(nn.Module):
+    r"""
+    Norm layer modified to incorporate timestep embeddings.
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`, *optional*): The size of the embeddings dictionary. When given, an `nn.Embedding`
+            is created and `forward` looks up the conditioning vector from `timestep`.
+        output_dim (`int`, *optional*): Output size of the linear projection. Defaults to `embedding_dim * 2`;
+            the LayerNorm normalizes over `output_dim // 2` features.
+        norm_elementwise_affine (`bool`, defaults to `False`): Passed to `nn.LayerNorm` as `elementwise_affine`.
+        norm_eps (`float`, defaults to `1e-5`): Passed to `nn.LayerNorm` as `eps`.
+        chunk_dim (`int`, defaults to `0`): Dimension along which the projected embedding is split into its
+            shift and scale halves.
+    """
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_embeddings: Optional[int] = None,
+        output_dim: Optional[int] = None,
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-5,
+        chunk_dim: int = 0,
+    ):
+        super().__init__()
+        self.chunk_dim = chunk_dim
+        output_dim = output_dim or embedding_dim * 2
+        if num_embeddings is not None:
+            self.emb = nn.Embedding(num_embeddings, embedding_dim)
+        else:
+            self.emb = None
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, output_dim)
+        self.norm = nn.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine)
+    def forward(
+        self, x: torch.Tensor, timestep: Optional[torch.Tensor] = None, temb: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """
+        Normalize `x` and modulate it as `norm(x) * (1 + scale) + shift`.
+        If the module owns an embedding table, the conditioning vector is looked up from `timestep` and any
+        `temb` argument is overwritten; otherwise `temb` must be provided by the caller.
+        """
+        if self.emb is not None:
+            temb = self.emb(timestep)
+        temb = self.linear(self.silu(temb))
+        if self.chunk_dim == 1:
+            # This is a bit weird why we have the order of "shift, scale" here and "scale, shift" in the
+            # other if-branch. This branch is specific to CogVideoX for now.
+            shift, scale = temb.chunk(2, dim=1)
+            # Insert an axis at dim 1 so the per-sample shift/scale broadcast over it.
+            shift = shift[:, None, :]
+            scale = scale[:, None, :]
+        else:
+            # NOTE: always splits along dim 0 here, whatever non-1 value `chunk_dim` holds.
+            scale, shift = temb.chunk(2, dim=0)
+        x = self.norm(x) * (1 + scale) + shift
+        return x

videosys/models/modules/upsampling.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class CogVideoXUpsample3D(nn.Module):
+    r"""
+    A 3D upsample layer used in CogVideoX by Tsinghua University & ZhipuAI. # TODO: wait for paper release.
+    The input is interpolated by a factor of 2 and then passed frame by frame through a 2D convolution.
+    Args:
+        in_channels (`int`):
+            Number of channels in the input image.
+        out_channels (`int`):
+            Number of channels produced by the convolution.
+        kernel_size (`int`, defaults to `3`):
+            Size of the convolving kernel.
+        stride (`int`, defaults to `1`):
+            Stride of the convolution.
+        padding (`int`, defaults to `1`):
+            Padding added to all four sides of the input.
+        compress_time (`bool`, defaults to `False`):
+            If True and the input has more than one frame, the time dimension is interpolated together with
+            height and width; otherwise only height and width are interpolated.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 1,
+        padding: int = 1,
+        compress_time: bool = False,
+    ) -> None:
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        """
+        Upsample `inputs`, unpacked below as (batch, channels, frames, height, width), and apply the convolution.
+        """
+        if self.compress_time:
+            if inputs.shape[2] > 1 and inputs.shape[2] % 2 == 1:
+                # split first frame
+                x_first, x_rest = inputs[:, :, 0], inputs[:, :, 1:]
+                # x_first is 4D, so only height and width are doubled.
+                x_first = F.interpolate(x_first, scale_factor=2.0)
+                # x_rest is 5D, so frames, height and width are all doubled.
+                x_rest = F.interpolate(x_rest, scale_factor=2.0)
+                # Restore the frame axis on the first frame before re-joining along time.
+                x_first = x_first[:, :, None, :, :]
+                inputs = torch.cat([x_first, x_rest], dim=2)
+            elif inputs.shape[2] > 1:
+                # Even number of frames: interpolate the whole 5D tensor (frames, height and width).
+                inputs = F.interpolate(inputs, scale_factor=2.0)
+            else:
+                # Single frame: drop the frame axis, interpolate spatially, then restore it.
+                inputs = inputs.squeeze(2)
+                inputs = F.interpolate(inputs, scale_factor=2.0)
+                inputs = inputs[:, :, None, :, :]
+        else:
+            # only interpolate 2D
+            b, c, t, h, w = inputs.shape
+            # Fold frames into the batch axis so interpolation acts on height and width only.
+            inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+            inputs = F.interpolate(inputs, scale_factor=2.0)
+            inputs = inputs.reshape(b, t, c, *inputs.shape[2:]).permute(0, 2, 1, 3, 4)
+        b, c, t, h, w = inputs.shape
+        # Apply the 2D convolution to every frame independently by folding frames into the batch axis.
+        inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+        inputs = self.conv(inputs)
+        # Unfold back to (batch, channels, frames, height, width).
+        inputs = inputs.reshape(b, t, *inputs.shape[1:]).permute(0, 2, 1, 3, 4)
+        return inputs