maxin-cn committed
Commit b1e71c1
1 Parent(s): 4051d56

Upload folder using huggingface_hub

Files changed (3):
  1. demo.py +25 -33
  2. requirements.txt +2 -1
  3. sample_videos/t2v-temp.mp4 +0 -0
demo.py CHANGED
@@ -12,7 +12,7 @@ from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
 from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
 from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
 from omegaconf import OmegaConf
-from transformers import T5EncoderModel, T5Tokenizer
+from transformers import T5EncoderModel, T5Tokenizer, BitsAndBytesConfig
 
 import os, sys
 sys.path.append(os.path.split(sys.path[0])[0])
@@ -38,7 +38,11 @@ if args.enable_vae_temporal_decoder:
 else:
     vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae", torch_dtype=torch.float16).to(device)
 tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer")
-text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path, subfolder="text_encoder", torch_dtype=torch.float16).to(device)
+text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path,
+                                              subfolder="text_encoder",
+                                              quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
+                                              device_map="auto",
+                                              )
 
 # set eval mode
 transformer_model.eval()
@@ -120,16 +124,28 @@ def gen_video(text_input, sample_method, scfg_scale, seed, height, width, video_
                                     beta_end=args.beta_end,
                                     beta_schedule=args.beta_schedule,
                                     variance_type=args.variance_type)
+
+    pipe_tmp = LattePipeline.from_pretrained(
+        args.pretrained_model_path,
+        transformer=None,
+        text_encoder=text_encoder,
+        device_map="balanced",)
+    prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(text_input, negative_prompt="")
 
 
     videogen_pipeline = LattePipeline(vae=vae,
-                                      text_encoder=text_encoder,
+                                      # text_encoder=text_encoder,
+                                      text_encoder=None,
                                       tokenizer=tokenizer,
                                       scheduler=scheduler,
                                       transformer=transformer_model).to(device)
     # videogen_pipeline.enable_xformers_memory_efficient_attention()
 
-    videos = videogen_pipeline(text_input,
+    videos = videogen_pipeline(
+                               # text_input,
+                               prompt_embeds=prompt_embeds,
+                               negative_prompt=None,
+                               negative_prompt_embeds=negative_prompt_embeds,
                                video_length=video_length,
                                height=height,
                                width=width,
@@ -185,26 +201,12 @@ with gr.Blocks() as demo:
     with gr.Column(visible=True) as input_raws:
         with gr.Row():
             with gr.Column(scale=1.0):
-                # text_input = gr.Textbox(show_label=True, interactive=True, label="Text prompt").style(container=False)
                 text_input = gr.Textbox(show_label=True, interactive=True, label="Prompt")
-                # with gr.Row():
-                #     with gr.Column(scale=0.5):
-                #         image_input = gr.Image(show_label=True, interactive=True, label="Reference image").style(container=False)
-                #     with gr.Column(scale=0.5):
-                #         preframe_input = gr.Image(show_label=True, interactive=True, label="First frame").style(container=False)
+
         with gr.Row():
             with gr.Column(scale=0.5):
                 sample_method = gr.Dropdown(choices=["DDIM", "EulerDiscrete", "PNDM"], label="Sample Method", value="DDIM")
-                # with gr.Row():
-                #     with gr.Column(scale=1.0):
-                #         video_length = gr.Slider(
-                #             minimum=1,
-                #             maximum=24,
-                #             value=1,
-                #             step=1,
-                #             interactive=True,
-                #             label="Video Length (1 for T2I and 16 for T2V)",
-                #         )
+
             with gr.Column(scale=0.5):
                 video_length = gr.Dropdown(choices=[1, 16], label="Video Length (1 for T2I and 16 for T2V)", value=16)
         with gr.Row():
@@ -260,21 +262,11 @@ with gr.Blocks() as demo:
 
 
         with gr.Column(scale=0.6, visible=True) as video_upload:
-            # with gr.Column(visible=True) as video_upload:
             output = gr.Video(interactive=False, include_audio=True, elem_id="输出的视频") #.style(height=360)
-            # with gr.Column(elem_id="image", scale=0.5) as img_part:
-            #     with gr.Tab("Video", elem_id='video_tab'):
-
-            #     with gr.Tab("Image", elem_id='image_tab'):
-            #         up_image = gr.Image(type="pil", interactive=True, elem_id="image_upload").style(height=360)
-            # upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
-            # clear = gr.Button("Restart")
 
         with gr.Row():
             with gr.Column(scale=1.0, min_width=0):
-                run = gr.Button("💭Run")
-                # with gr.Column(scale=0.5, min_width=0):
-                #     clear = gr.Button("🔄Clear️")
+                run = gr.Button(value="Generate", variant='primary')
 
 EXAMPLES = [
     ["3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.", "DDIM", 7.5, 100, 512, 512, 16, 50],
@@ -291,8 +283,8 @@ with gr.Blocks() as demo:
         fn = gen_video,
         inputs=[text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step],
         outputs=[output],
-        # cache_examples=True,
-        cache_examples="lazy",
+        cache_examples=True,
+        # cache_examples="lazy",
     )
 
 run.click(gen_video, [text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step], [output])
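
What the demo.py change does: instead of keeping an fp16 T5 text encoder resident next to the Latte transformer, the commit loads the encoder in 4-bit via bitsandbytes, encodes the prompt once through a temporary pipeline that skips the transformer (transformer=None), and then builds the generation pipeline with text_encoder=None, feeding it the precomputed prompt_embeds and negative_prompt_embeds. Below is a minimal end-to-end sketch of that pattern, following the LattePipeline interface shown in the diff; the model id and prompt are placeholders, and the del/empty_cache step is an extra illustration the commit itself does not perform:

import torch
from diffusers import LattePipeline  # the demo uses its local LattePipeline; diffusers ships an equivalent
from transformers import T5EncoderModel, BitsAndBytesConfig

model_path = "maxin-cn/Latte-1"  # placeholder; the demo passes args.pretrained_model_path

# 1) Load only the text encoder, quantized to 4-bit by bitsandbytes.
text_encoder = T5EncoderModel.from_pretrained(
    model_path,
    subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                           bnb_4bit_compute_dtype=torch.float16),
    device_map="auto",
)

# 2) Encode the prompt once through a pipeline that skips the transformer.
pipe_tmp = LattePipeline.from_pretrained(model_path,
                                         transformer=None,
                                         text_encoder=text_encoder,
                                         device_map="balanced")
prompt = "a small fluffy creature exploring an enchanted forest"
prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(prompt, negative_prompt="")

# 3) Release the encoder before the heavy transformer loads
#    (an extra step for illustration; the commit keeps the encoder alive).
del pipe_tmp, text_encoder
torch.cuda.empty_cache()

# 4) Generate from the precomputed embeddings; no text encoder is needed.
pipe = LattePipeline.from_pretrained(model_path,
                                     text_encoder=None,
                                     torch_dtype=torch.float16).to("cuda")
videos = pipe(prompt_embeds=prompt_embeds,
              negative_prompt=None,
              negative_prompt_embeds=negative_prompt_embeds,
              video_length=16).frames

The 4-bit load only shrinks the encoder while it is alive; keeping it off the main device (device_map="auto"/"balanced") is what frees room for the transformer during sampling.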
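Separately, the Gradio example cache switches from cache_examples="lazy" (each example runs the first time a user clicks it; supported in Gradio 4.25+) to cache_examples=True, which precomputes every example's output at startup so clicks return instantly. A toy illustration of the switch, with a stand-in function instead of gen_video:

import gradio as gr

def echo(prompt: str) -> str:  # stand-in for gen_video
    return prompt.upper()

with gr.Blocks() as demo:
    box = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Output")
    gr.Examples(examples=[["a glowing forest"]],
                inputs=[box], outputs=[out], fn=echo,
                cache_examples=True)  # run all examples once at launch
    # cache_examples="lazy" would defer each example until first use

For a 50-step video model, eager caching makes startup slower but every example click free afterwards; lazy caching spreads that cost across first clicks.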
requirements.txt CHANGED
@@ -17,4 +17,5 @@ beautifulsoup4
 ftfy
 omegaconf
 spaces
-imageio-ffmpeg
+imageio-ffmpeg
+bitsandbytes
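
requirements.txt gains bitsandbytes because BitsAndBytesConfig(load_in_4bit=True) needs its CUDA kernels at load time. To sanity-check the saving, transformers' get_memory_footprint() can compare the two load modes; a sketch with a placeholder model id, where exact numbers depend on the checkpoint but 4-bit weights come out at roughly a quarter of fp16:

import torch
from transformers import T5EncoderModel, BitsAndBytesConfig

model_path = "maxin-cn/Latte-1"  # placeholder checkpoint id

fp16 = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder",
                                      torch_dtype=torch.float16)
print(f"fp16 footprint: {fp16.get_memory_footprint() / 2**30:.2f} GiB")

nf4 = T5EncoderModel.from_pretrained(
    model_path, subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                           bnb_4bit_compute_dtype=torch.float16),
    device_map="auto")
print(f"4-bit footprint: {nf4.get_memory_footprint() / 2**30:.2f} GiB")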
sample_videos/t2v-temp.mp4 CHANGED
Binary files a/sample_videos/t2v-temp.mp4 and b/sample_videos/t2v-temp.mp4 differ