HReynaud committed on
Commit
cfb9037
·
verified ·
1 Parent(s): f74e63f

Upload folder using huggingface_hub

Browse files
.vscode/launch.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Python Debugger: Current File",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "program": "${file}",
12
+ "console": "integratedTerminal",
13
+ "justMyCode": false
14
+ }
15
+ ]
16
+ }
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
  title: EchoNet Synthetic
3
- emoji: 🏃
4
- colorFrom: purple
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 4.31.4
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
  title: EchoNet Synthetic
3
+ emoji: 🦀
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 4.19.2
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ import torch
5
+ import torch.nn as nn
6
+ import diffusers
7
+ from einops import rearrange
8
+ from PIL import Image
9
+ from omegaconf import OmegaConf
10
+ from tqdm import tqdm
11
+ import cv2
12
+
13
# Default number of sampling steps. Not referenced by the code visible in this
# file; the "Sampling Steps" slider supplies the value that is actually used.
NUM_STEPS = 64

# Number of frames requested for a generated video. When this exceeds the video
# model's ``num_frames``, ``Context.get_vid`` samples overlapping windows.
FRAMES = 192

# Frame rate passed to the video writer for the output file.
FPS = 32
16
+
17
+ mycss = """
18
+ .contain {
19
+ width: 1000px;
20
+ margin: 0 auto;
21
+ }
22
+
23
+ .svelte-1pijsyv {
24
+ width: 448px;
25
+ }
26
+
27
+ .arrow {
28
+ display: flex;
29
+ align-items: center;
30
+ margin: 7px 0;
31
+ }
32
+
33
+ .arrow-tail {
34
+ width: 270px;
35
+ height: 50px;
36
+ background-color: black;
37
+ transition: background-color 0.3s;
38
+ }
39
+
40
+ .arrow-head {
41
+ width: 0;
42
+ height: 0;
43
+ border-top: 70px solid transparent;
44
+ border-bottom: 70px solid transparent;
45
+ border-left: 120px solid black;
46
+ transition: border-left-color 0.3s;
47
+ }
48
+
49
+ @media (prefers-color-scheme: dark) {
50
+ .arrow-tail {
51
+ background-color: white;
52
+ }
53
+ .arrow-head {
54
+ border-left-color: white;
55
+ }
56
+ }
57
+
58
+ """
59
+
60
+ myhtml = """
61
+ <div class="arrow">
62
+ <div class="arrow-tail"></div>
63
+ <div class="arrow-head"></div>
64
+ </div>
65
+ """
66
+
67
+ myjs = """
68
+ function setLoopTrue() {
69
+ let videos = document.getElementsByTagName('video');
70
+ if (videos.length > 0) {
71
+ document.getElementsByTagName('video')[0].loop = true;
72
+ }
73
+ setTimeout(setLoopTrue, 3000);
74
+ }
75
+ """
76
+
77
def load_model(path):
    """Load a pretrained model from a checkpoint directory.

    The class to instantiate is named by the ``_class_name`` entry of
    ``<path>/config.json``. It is looked up in the ``diffusers`` package first
    and in this module's globals second.

    Args:
        path: Directory containing ``config.json`` and the checkpoint files.

    Returns:
        A ``(model, config)`` tuple, where ``config`` is the parsed JSON dict.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist under ``path``.
        ValueError: If the class cannot be resolved, or if it does not provide
            ``from_pretrained``.
    """
    # Explicit raises (rather than ``assert``) so the checks survive ``python -O``.
    json_path = os.path.join(path, "config.json")
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Could not find config.json at {json_path}")
    with open(json_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    # Resolve the class: diffusers first, then anything defined in this module.
    klass_name = config["_class_name"]
    klass = getattr(diffusers, klass_name, None)
    if klass is None:
        klass = globals().get(klass_name)
    if klass is None:
        raise ValueError(
            f"Could not find class {klass_name} in diffusers or global scope."
        )
    if getattr(klass, "from_pretrained", None) is None:
        raise ValueError(f"Class {klass_name} does not support 'from_pretrained'.")

    # Load the checkpoint.
    model = klass.from_pretrained(path)

    return model, config
97
+
98
def load_scheduler(config):
    """Instantiate the noise scheduler described by ``config.noise_scheduler``.

    The ``_class_name`` entry selects the scheduler class from the
    ``diffusers`` package; every other entry is passed to its constructor as a
    keyword argument.

    Args:
        config: OmegaConf configuration with a ``noise_scheduler`` section.

    Returns:
        The constructed scheduler instance.

    Raises:
        ValueError: If ``diffusers`` has no attribute named by ``_class_name``.
    """
    scheduler_kwargs = OmegaConf.to_container(config.noise_scheduler)
    scheduler_klass_name = scheduler_kwargs.pop("_class_name")
    scheduler_klass = getattr(diffusers, scheduler_klass_name, None)
    if scheduler_klass is None:
        # Fail with a clear message instead of "'NoneType' object is not callable".
        raise ValueError(
            f"Could not find scheduler class {scheduler_klass_name} in diffusers."
        )
    return scheduler_klass(**scheduler_kwargs)
104
+
105
def padf(tensor, mult=3):
    """Replicate-pad the last two dimensions of ``tensor`` symmetrically.

    The margin is derived from the size of the last dimension only and is
    applied to both of the last two dimensions:
    ``(2**mult - width % 2**mult) // 2`` on each side. Two consequences follow
    from that arithmetic: a width that is already a multiple of ``2**mult`` is
    still padded by half a block per side, and a width one short of a multiple
    yields a margin of 0.

    Args:
        tensor: Input tensor; the pad spec covers its last three dimensions,
            with the third-from-last left unpadded.
        mult: Exponent of the block size ``2**mult``.

    Returns:
        A ``(padded_tensor, margin)`` tuple.
    """
    block = 2**mult
    margin = (block - tensor.shape[-1] % block) // 2
    # Pad spec order is (last dim left/right, second-to-last left/right, third 0/0).
    padded = nn.functional.pad(tensor, (margin,) * 4 + (0, 0), mode='replicate')
    return padded, margin
110
+
111
def unpadf(tensor, pad=1):
    """Crop ``pad`` elements from both ends of the last two dimensions.

    Args:
        tensor: Tensor whose last two dimensions are cropped.
        pad: Number of elements to remove from each side.

    Returns:
        The cropped tensor, or ``tensor`` itself when ``pad`` is 0.
    """
    if pad == 0:
        # ``tensor[..., 0:-0, 0:-0]`` is an empty slice, so there is nothing to crop.
        return tensor
    return tensor[..., pad:-pad, pad:-pad]
113
+
114
def pad_reshape(tensor, mult=3):
    """Pad a ``b c t h w`` tensor with ``padf`` and return it as ``b t c h w``.

    Args:
        tensor: Five-dimensional tensor laid out as ``b c t h w``.
        mult: Forwarded to ``padf``.

    Returns:
        A ``(tensor, pad)`` tuple: the padded tensor in ``b t c h w`` layout and
        the margin reported by ``padf``.
    """
    padded, pad = padf(tensor, mult=mult)
    return rearrange(padded, "b c t h w -> b t c h w"), pad
118
+
119
def unpad_reshape(tensor, pad=1):
    """Convert a ``b t c h w`` tensor to ``b c t h w`` and crop it with ``unpadf``.

    Args:
        tensor: Five-dimensional tensor laid out as ``b t c h w``.
        pad: Margin to remove from each side of the last two dimensions.

    Returns:
        The cropped tensor in ``b c t h w`` layout.
    """
    channels_first = rearrange(tensor, "b t c h w -> b c t h w")
    return unpadf(channels_first, pad=pad)
123
+
124
class Context:
    """Owns the loaded models and the scheduler, and runs image/video sampling.

    Three models are loaded with ``load_model``: an image diffusion model
    (``lidm``), a video diffusion model (``lvdm``) and a VAE used to decode
    latents. All are moved to CUDA when available, otherwise CPU, and put in
    eval mode.
    """

    def __init__(self, lidm_path, lvdm_path, vae_path, config_path):
        """Load models, configuration and scheduler.

        Args:
            lidm_path: Checkpoint directory of the image diffusion model.
            lvdm_path: Checkpoint directory of the video diffusion model.
            vae_path: Checkpoint directory of the VAE.
            config_path: Path of the OmegaConf file describing the scheduler.
        """
        self.lidm, self.lidm_config = load_model(lidm_path)
        self.lvdm, self.lvdm_config = load_model(lvdm_path)
        self.vae, self.vae_config = load_model(vae_path)
        self.config = OmegaConf.load(config_path)
        self.models = [self.lidm, self.lvdm, self.vae]
        self.scheduler = load_scheduler(self.config)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dtype = torch.float32

        for model in self.models:
            model.to(self.device, dtype=self.dtype)
            model.eval()

        print("Models loaded")

    def get_img(self, steps):
        """Sample one image with the image diffusion model.

        Args:
            steps: Number of scheduler timesteps to run.

        Returns:
            A ``(img, latents)`` tuple: ``img`` is a PIL image whose three
            channels all hold the channel-mean of the decoded output, and
            ``latents`` is the final latent tensor divided by the VAE scaling
            factor (the tensor that was passed to ``vae.decode``).
        """
        print("generating image")
        self.scheduler.set_timesteps(steps)
        with torch.no_grad():
            B, C = 1, self.lidm_config["in_channels"]
            H = W = self.lidm_config["sample_size"]

            timesteps = self.scheduler.timesteps
            forward_kwargs = {}

            latents = torch.randn((B, C, H, W), device=self.device, dtype=self.dtype)
            # Autocast is only enabled on CUDA; on CPU this is a no-op context
            # (same numerics as before, without the "CUDA is not available" warning).
            with torch.autocast("cuda", enabled=self.device.type == "cuda"):
                for t in tqdm(timesteps):
                    forward_kwargs["timestep"] = t
                    latent_model_input = self.scheduler.scale_model_input(latents, timestep=t)
                    latent_model_input, padding = padf(latent_model_input, mult=3)
                    noise_pred = self.lidm(latent_model_input, **forward_kwargs).sample
                    noise_pred = unpadf(noise_pred, pad=padding)
                    latents = self.scheduler.step(noise_pred, t, latents).prev_sample

            # latents: [B, C, H, W]
            latents = latents / self.vae.config.scaling_factor
            img = self.vae.decode(latents).sample
            img = (img + 1) * 128  # [-1, 1] -> [0, 256]
            # Collapse to the channel mean and replicate it over three channels.
            img = img.mean(1).unsqueeze(1).repeat([1, 3, 1, 1])
            img = img.clamp(0, 255).to(torch.uint8).cpu().numpy()
            img = img[0].transpose(1, 2, 0)  # C H W -> H W C
            img = Image.fromarray(img)

        return img, latents

    def get_vid(self, lvef: int, ref_latent: torch.Tensor, steps: int):
        """Sample a video conditioned on a reference latent and an LVEF value.

        When ``FRAMES`` exceeds the model's ``num_frames`` (``T``), the clip is
        denoised as several windows of ``T`` frames that overlap by ``T // 2``;
        for every window after the first, the overlapping frames of its noise
        prediction are discarded before the windows are concatenated.

        Args:
            lvef: Ejection fraction in percent; divided by 100 before use.
            ref_latent: Latent returned by ``get_img`` (already divided by the
                VAE scaling factor).
            steps: Number of scheduler timesteps to run.

        Returns:
            The path of the written video file, ``"output.mp4"``.

        Raises:
            gr.Error: If ``ref_latent`` is ``None`` (no image generated yet).
        """
        if ref_latent is None:
            # The UI state is empty until "Generate Image" has run once.
            raise gr.Error("Please generate an image before generating a video.")

        print("generating video")
        self.scheduler.set_timesteps(steps)

        with torch.no_grad():
            B, C = 1, 4
            T = self.lvdm_config["num_frames"]
            H = W = self.lvdm_config["sample_size"]

            if FRAMES > T:
                OT = T // 2  # overlap between consecutive windows
                # Total repetitions; the divisor is the window stride (T - OT),
                # which equals the previously hard-coded 32 when T == 64.
                TR = int((FRAMES - T) / (T - OT) + 1)
                NT = (T - OT) * TR + OT  # total number of frames sampled
            else:
                OT = 0
                TR = 1
                NT = T
            stride = T - OT

            timesteps = self.scheduler.timesteps

            lvef = lvef / 100
            lvef = torch.tensor([lvef] * TR, device=self.device, dtype=self.dtype)
            lvef = lvef[:, None, None]  # TR x 1 x 1
            print(lvef.shape)

            forward_kwargs = {}
            forward_kwargs["added_time_ids"] = torch.zeros(
                (B * TR, self.config.unet.addition_time_embed_dim),
                device=self.device,
                dtype=self.dtype,
            )
            forward_kwargs["encoder_hidden_states"] = lvef
            print(forward_kwargs["added_time_ids"].shape)

            # Undo the division applied in get_img, then repeat along the frame axis.
            latent_cond_images = ref_latent * self.vae.config.scaling_factor
            latent_cond_images = (
                latent_cond_images[:, :, None, :, :]
                .repeat([1, 1, NT, 1, 1])
                .to(self.device, dtype=self.dtype)
            )
            print(latent_cond_images.shape)

            latents = torch.randn((B, C, NT, H, W), device=self.device, dtype=self.dtype)
            print(latents.shape)

            # Autocast is only enabled on CUDA; on CPU this is a no-op context.
            with torch.autocast("cuda", enabled=self.device.type == "cuda"):
                for t in tqdm(timesteps):
                    forward_kwargs["timestep"] = t
                    latent_model_input = self.scheduler.scale_model_input(latents, timestep=t)
                    latent_model_input = torch.cat((latent_model_input, latent_cond_images), dim=1)  # B x 2C x T x H x W
                    latent_model_input, padding = pad_reshape(latent_model_input, mult=3)  # B x T x 2C x H+P x W+P

                    # Stack the overlapping windows along the batch axis.
                    inputs = torch.cat(
                        [latent_model_input[:, r * stride:r * stride + T] for r in range(TR)],
                        dim=0,
                    )  # B*TR x T x 2C x H+P x W+P
                    noise_pred = self.lvdm(inputs, **forward_kwargs).sample
                    outputs = torch.chunk(noise_pred, TR, dim=0)  # TR x B x T x C x H x W
                    # Keep the first window whole; drop the overlap from the others.
                    noise_predictions = [
                        window if r == 0 else window[:, OT:]
                        for r, window in enumerate(outputs)
                    ]
                    noise_pred = torch.cat(noise_predictions, dim=1)  # B x NT x C x H x W
                    noise_pred = unpad_reshape(noise_pred, pad=padding)
                    latents = self.scheduler.step(noise_pred, t, latents).prev_sample

            print("done generating noise")
            # latents: [B, C, T, H, W]
            latents = latents / self.vae.config.scaling_factor
            latents = rearrange(latents, "b c t h w -> (b t) c h w")

            # Decode a few frames at a time and collect the results on the CPU.
            chunk_size = 16
            decoded_chunks = [
                # Decode on the device the models live on (was a hard-coded
                # ``.cuda()``, which fails on CPU-only hosts).
                self.vae.decode(chunk.float().to(self.device)).sample.cpu()
                for chunk in torch.split(latents, chunk_size, dim=0)
            ]
            video = torch.cat(decoded_chunks, dim=0)  # (B*T) x C x H x W
            video = rearrange(video, "(b t) c h w -> b t h w c", b=B)[0]  # T H W C
            video = (video + 1) * 128  # [-1, 1] -> [0, 256]
            video = video.mean(-1).unsqueeze(-1).repeat([1, 1, 1, 3])  # T H W 3
            video = video.clamp(0, 255).to(torch.uint8).cpu().numpy()

        # Take the frame size from the decoded frames; OpenCV expects (width, height).
        frame_h, frame_w = video.shape[1:3]
        out = cv2.VideoWriter('output.mp4', cv2.VideoWriter_fourcc(*'mp4v'), FPS, (frame_w, frame_h))
        try:
            for frame in video:
                out.write(frame)
        finally:
            # Always release the writer so the file is closed even on failure.
            out.release()

        return "output.mp4"
247
+
248
+
249
# Load all models once at start-up; the Gradio callbacks below share this instance.
ctx = Context(
    lidm_path="resources/lidm",
    lvdm_path="resources/lvdm",
    vae_path="resources/ivae",
    config_path="resources/config.yaml"
)

with gr.Blocks(css=mycss, js=myjs) as demo:
    with gr.Row():
        # Greet the user with an explanation of the demo.
        gr.Markdown("""
        # EchoNet-Synthetic: Privacy-preserving Video Generation for Safe Medical Data Sharing
        This demo is attached to a paper under review at MICCAI 2024, and is targeted at the reviewers of that paper.

        1. Start by generating an image using the "Generate Image" button. This will generate a random image, similar to the EchoNet-Dynamic dataset.
        2. Adjust the "Ejection Fraction Score" slider to change the ejection fraction of the generated image.
        3. Generate a video using the "Generate Video" button. This will generate a video from the generated image, with the ejection fraction score you chose.

        We leave the ejection fraction input completely open, so you can see how the video generation changes with different ejection fraction scores, even unrealistic ones. The normal ejection fraction range is 50-75.<br>
        We recommend 64 steps for ideal image quality, but you can adjust this to see how it affects the video generation.

        """)

    with gr.Row():
        # Core activity, laid out in three columns.
        with gr.Column():
            # Image generation (display only; the component is not interactive).
            img = gr.Image(interactive=False, label="Generated Image")
            img_btn = gr.Button("Generate Image")

        with gr.Column():
            # Decorative arrow followed by the sampling controls.
            gr.HTML(myhtml)
            efslider = gr.Slider(minimum=0, maximum=100, value=65, step=1, label="Ejection Fraction Score (%)")
            dsslider = gr.Slider(minimum=1, maximum=999, value=64, step=1, label="Sampling Steps")

        with gr.Column():
            # Video generation.
            vid = gr.Video(interactive=False, autoplay=True, label="Generated Video")
            vid_btn = gr.Button("Generate Video")

    with gr.Row():
        # Pre-computed examples: clicking one fills the image, video and sliders.
        gr.Examples(
            examples=[[f"resources/examples/ef{i}.png", f"resources/examples/ef{i}.mp4", i, 64] for i in [20, 30, 40, 50, 60, 70, 80, 90]],
            inputs=[img, vid, efslider, dsslider],
            outputs=None,
            fn=None,
            cache_examples=False,
        )

    # Latent of the most recently generated image; consumed by video generation.
    ltt_img = gr.State()

    # Generate an image with the image diffusion model and remember its latent.
    img_btn.click(fn=ctx.get_img, inputs=[dsslider], outputs=[img, ltt_img])

    # Generate a video with the video diffusion model from the stored latent.
    vid_btn.click(fn=ctx.get_vid, inputs=[efslider, ltt_img, dsslider], outputs=[vid])

demo.launch(share=False)
output.mp4 ADDED
Binary file (143 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.1.1
2
+ torchvision==0.16.1
3
+ diffusers==0.24.0
4
+ einops==0.7.0
5
+ accelerate==0.25.0
6
+ opencv-python==4.8.1.78
7
+ pillow==10.1.0
8
+ omegaconf==2.3.0
9
+ tqdm==4.66.1
10
+ gradio==4.19.2
resources/config.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: 32
3
+ target_nframes: 64
4
+
5
+ unet:
6
+ _class_name: UNetSpatioTemporalConditionModel
7
+ addition_time_embed_dim: 1
8
+ block_out_channels:
9
+ - 128
10
+ - 256
11
+ - 256
12
+ - 512
13
+ cross_attention_dim: 1
14
+ down_block_types:
15
+ - CrossAttnDownBlockSpatioTemporal
16
+ - CrossAttnDownBlockSpatioTemporal
17
+ - CrossAttnDownBlockSpatioTemporal
18
+ - DownBlockSpatioTemporal
19
+ in_channels: 8
20
+ layers_per_block: 2
21
+ num_attention_heads:
22
+ - 8
23
+ - 16
24
+ - 16
25
+ - 32
26
+ num_frames: ${globals.target_nframes}
27
+ out_channels: 4
28
+ projection_class_embeddings_input_dim: 1
29
+ sample_size: 14
30
+ transformer_layers_per_block: 1
31
+ up_block_types:
32
+ - UpBlockSpatioTemporal
33
+ - CrossAttnUpBlockSpatioTemporal
34
+ - CrossAttnUpBlockSpatioTemporal
35
+ - CrossAttnUpBlockSpatioTemporal
36
+
37
+ noise_scheduler:
38
+ _class_name: DDPMScheduler
39
+ num_train_timesteps: 1000
40
+ beta_start: 0.0001
41
+ beta_end: 0.02
42
+ beta_schedule: linear # linear, scaled_linear, or squaredcos_cap_v2
43
+ variance_type: fixed_small # fixed_small, fixed_small_log, fixed_large, fixed_large_log, learned or learned_range
44
+ clip_sample: true
45
+ clip_sample_range: 4.0 # default 1
46
+ prediction_type: v_prediction # epsilon, sample, v_prediction
47
+ thresholding: false # do not touch
48
+ dynamic_thresholding_ratio: 0.995 # unused
49
+ sample_max_value: 1.0 # unused
50
+ timestep_spacing: "leading" #
51
+ steps_offset: 0 # unused
resources/examples/ef20.mp4 ADDED
Binary file (90.5 kB). View file
 
resources/examples/ef20.png ADDED
resources/examples/ef30.mp4 ADDED
Binary file (118 kB). View file
 
resources/examples/ef30.png ADDED
resources/examples/ef40.mp4 ADDED
Binary file (125 kB). View file
 
resources/examples/ef40.png ADDED
resources/examples/ef50.mp4 ADDED
Binary file (124 kB). View file
 
resources/examples/ef50.png ADDED
resources/examples/ef60.mp4 ADDED
Binary file (129 kB). View file
 
resources/examples/ef60.png ADDED
resources/examples/ef70.mp4 ADDED
Binary file (95.7 kB). View file
 
resources/examples/ef70.png ADDED
resources/examples/ef80.mp4 ADDED
Binary file (137 kB). View file
 
resources/examples/ef80.png ADDED
resources/examples/ef90.mp4 ADDED
Binary file (152 kB). View file
 
resources/examples/ef90.png ADDED
resources/ivae/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.23.1",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 256,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "layers_per_block": 2,
21
+ "norm_num_groups": 32,
22
+ "out_channels": 3,
23
+ "sample_size": 512,
24
+ "scaling_factor": 0.18215,
25
+ "up_block_types": [
26
+ "UpDecoderBlock2D",
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D"
30
+ ]
31
+ }
resources/ivae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0be1555511d1e145bfda156062aab744c6f7fc12e930c78c3640baf8183d5b
3
+ size 249675844
resources/lidm/config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DModel",
3
+ "_diffusers_version": "0.24.0",
4
+ "act_fn": "silu",
5
+ "add_attention": true,
6
+ "attention_head_dim": 8,
7
+ "attn_norm_num_groups": null,
8
+ "block_out_channels": [
9
+ 128,
10
+ 256,
11
+ 256,
12
+ 512
13
+ ],
14
+ "center_input_sample": false,
15
+ "class_embed_type": null,
16
+ "decay": 0.9999,
17
+ "down_block_types": [
18
+ "AttnDownBlock2D",
19
+ "AttnDownBlock2D",
20
+ "AttnDownBlock2D",
21
+ "DownBlock2D"
22
+ ],
23
+ "downsample_padding": 1,
24
+ "downsample_type": "resnet",
25
+ "dropout": 0.0,
26
+ "flip_sin_to_cos": true,
27
+ "freq_shift": 0,
28
+ "in_channels": 4,
29
+ "inv_gamma": 1.0,
30
+ "layers_per_block": 2,
31
+ "mid_block_scale_factor": 1,
32
+ "min_decay": 0.0,
33
+ "norm_eps": 1e-05,
34
+ "norm_num_groups": 32,
35
+ "num_class_embeds": null,
36
+ "num_train_timesteps": null,
37
+ "optimization_step": 250000,
38
+ "out_channels": 4,
39
+ "power": 0.6666666666666666,
40
+ "resnet_time_scale_shift": "default",
41
+ "sample_size": 14,
42
+ "time_embedding_type": "positional",
43
+ "up_block_types": [
44
+ "UpBlock2D",
45
+ "AttnUpBlock2D",
46
+ "AttnUpBlock2D",
47
+ "AttnUpBlock2D"
48
+ ],
49
+ "update_after_step": 0,
50
+ "upsample_type": "resnet",
51
+ "use_ema_warmup": false
52
+ }
resources/lidm/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d93d2de24f4f081919dd946a03cf7a59befcc4f711e4a983f2c5b15be45920
3
+ size 294245640
resources/lvdm/config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNetSpatioTemporalConditionModel",
3
+ "_diffusers_version": "0.24.0",
4
+ "addition_time_embed_dim": 1,
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 256,
9
+ 512
10
+ ],
11
+ "cross_attention_dim": 1,
12
+ "decay": 0.9999,
13
+ "down_block_types": [
14
+ "CrossAttnDownBlockSpatioTemporal",
15
+ "CrossAttnDownBlockSpatioTemporal",
16
+ "CrossAttnDownBlockSpatioTemporal",
17
+ "DownBlockSpatioTemporal"
18
+ ],
19
+ "in_channels": 8,
20
+ "inv_gamma": 1.0,
21
+ "layers_per_block": 2,
22
+ "min_decay": 0.0,
23
+ "num_attention_heads": [
24
+ 8,
25
+ 16,
26
+ 16,
27
+ 32
28
+ ],
29
+ "num_frames": 64,
30
+ "optimization_step": 80000,
31
+ "out_channels": 4,
32
+ "power": 0.6666666666666666,
33
+ "projection_class_embeddings_input_dim": 1,
34
+ "sample_size": 14,
35
+ "transformer_layers_per_block": 1,
36
+ "up_block_types": [
37
+ "UpBlockSpatioTemporal",
38
+ "CrossAttnUpBlockSpatioTemporal",
39
+ "CrossAttnUpBlockSpatioTemporal",
40
+ "CrossAttnUpBlockSpatioTemporal"
41
+ ],
42
+ "update_after_step": 0,
43
+ "use_ema_warmup": false
44
+ }
resources/lvdm/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:452750040eb2183d04b5547ae9493e555246cc70c11de7d9d4897377811c520e
3
+ size 575506960