Upload folder using huggingface_hub
Files changed:
- demo.py +25 -33
- requirements.txt +2 -1
- sample_videos/t2v-temp.mp4 +0 -0
demo.py
CHANGED
@@ -12,7 +12,7 @@ from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
 from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
 from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
 from omegaconf import OmegaConf
-from transformers import T5EncoderModel, T5Tokenizer
+from transformers import T5EncoderModel, T5Tokenizer, BitsAndBytesConfig
 
 import os, sys
 sys.path.append(os.path.split(sys.path[0])[0])
@@ -38,7 +38,11 @@ if args.enable_vae_temporal_decoder:
 else:
     vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae", torch_dtype=torch.float16).to(device)
 tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer")
-text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path,
+text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path,
+                                              subfolder="text_encoder",
+                                              quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
+                                              device_map="auto",
+                                              )
 
 # set eval mode
 transformer_model.eval()
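Note: the 4-bit load above is the core memory saving in this commit. As a standalone sketch of the same pattern (the checkpoint id below is a placeholder; the demo passes args.pretrained_model_path):

import torch
from transformers import T5EncoderModel, BitsAndBytesConfig

# 4-bit weights with fp16 compute: roughly a quarter of the fp16 memory
# footprint for the T5 encoder, while matmuls still run in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
text_encoder = T5EncoderModel.from_pretrained(
    "some/latte-checkpoint",   # placeholder id, not from this commit
    subfolder="text_encoder",
    quantization_config=quant_config,
    device_map="auto",         # let accelerate place the quantized weights
)
text_encoder.eval()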
@@ -120,16 +124,28 @@ def gen_video(text_input, sample_method, scfg_scale, seed, height, width, video_
                               beta_end=args.beta_end,
                               beta_schedule=args.beta_schedule,
                               variance_type=args.variance_type)
+
+    pipe_tmp = LattePipeline.from_pretrained(
+        args.pretrained_model_path,
+        transformer=None,
+        text_encoder=text_encoder,
+        device_map="balanced",)
+    prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(text_input, negative_prompt="")
 
 
     videogen_pipeline = LattePipeline(vae=vae,
-                                      text_encoder=text_encoder,
+                                      # text_encoder=text_encoder,
+                                      text_encoder=None,
                                       tokenizer=tokenizer,
                                       scheduler=scheduler,
                                       transformer=transformer_model).to(device)
     # videogen_pipeline.enable_xformers_memory_efficient_attention()
 
-    videos = videogen_pipeline(
+    videos = videogen_pipeline(
+                               # text_input,
+                               prompt_embeds=prompt_embeds,
+                               negative_prompt=None,
+                               negative_prompt_embeds=negative_prompt_embeds,
                                video_length=video_length,
                                height=height,
                                width=width,
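Note: this hunk splits generation into two stages so the text encoder and the transformer never have to be resident together: a temporary pipeline with transformer=None encodes the prompt, then the main pipeline runs with text_encoder=None and the precomputed embeddings. A condensed sketch of the flow (pipeline_cls stands in for LattePipeline; the explicit cleanup between stages is an assumption, not part of this commit):

import gc
import torch

def encode_then_generate(pipeline_cls, model_path, text_encoder,
                         videogen_pipeline, prompt, **gen_kwargs):
    # Stage 1: no transformer is loaded, so only the 4-bit text encoder
    # occupies memory while the prompt is encoded.
    pipe_tmp = pipeline_cls.from_pretrained(
        model_path,
        transformer=None,
        text_encoder=text_encoder,
        device_map="balanced",
    )
    prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(
        prompt, negative_prompt="")

    # Assumed cleanup (not in this commit): free the temporary pipeline
    # before denoising starts.
    del pipe_tmp
    gc.collect()
    torch.cuda.empty_cache()

    # Stage 2: videogen_pipeline was built with text_encoder=None, so only
    # the transformer and VAE sit in VRAM during denoising.
    return videogen_pipeline(
        prompt_embeds=prompt_embeds,
        negative_prompt=None,
        negative_prompt_embeds=negative_prompt_embeds,
        **gen_kwargs,
    )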
@@ -185,26 +201,12 @@ with gr.Blocks() as demo:
         with gr.Column(visible=True) as input_raws:
             with gr.Row():
                 with gr.Column(scale=1.0):
-                    # text_input = gr.Textbox(show_label=True, interactive=True, label="Text prompt").style(container=False)
                     text_input = gr.Textbox(show_label=True, interactive=True, label="Prompt")
-
-                    # with gr.Column(scale=0.5):
-                    #     image_input = gr.Image(show_label=True, interactive=True, label="Reference image").style(container=False)
-                    # with gr.Column(scale=0.5):
-                    #     preframe_input = gr.Image(show_label=True, interactive=True, label="First frame").style(container=False)
+
             with gr.Row():
                 with gr.Column(scale=0.5):
                     sample_method = gr.Dropdown(choices=["DDIM", "EulerDiscrete", "PNDM"], label="Sample Method", value="DDIM")
-
-                    # with gr.Column(scale=1.0):
-                    #     video_length = gr.Slider(
-                    #         minimum=1,
-                    #         maximum=24,
-                    #         value=1,
-                    #         step=1,
-                    #         interactive=True,
-                    #         label="Video Length (1 for T2I and 16 for T2V)",
-                    #     )
+
                 with gr.Column(scale=0.5):
                     video_length = gr.Dropdown(choices=[1, 16], label="Video Length (1 for T2I and 16 for T2V)", value=16)
             with gr.Row():
@@ -260,21 +262,11 @@ with gr.Blocks() as demo:
 
 
         with gr.Column(scale=0.6, visible=True) as video_upload:
-            # with gr.Column(visible=True) as video_upload:
             output = gr.Video(interactive=False, include_audio=True, elem_id="输出的视频") #.style(height=360)
-            # with gr.Column(elem_id="image", scale=0.5) as img_part:
-            #     with gr.Tab("Video", elem_id='video_tab'):
-
-            #     with gr.Tab("Image", elem_id='image_tab'):
-            #         up_image = gr.Image(type="pil", interactive=True, elem_id="image_upload").style(height=360)
-            # upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
-            # clear = gr.Button("Restart")
 
         with gr.Row():
             with gr.Column(scale=1.0, min_width=0):
-                run = gr.Button("
-                # with gr.Column(scale=0.5, min_width=0):
-                #     clear = gr.Button("🔄Clear️")
+                run = gr.Button(value="Generate", variant='primary')
 
     EXAMPLES = [
         ["3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.", "DDIM", 7.5, 100, 512, 512, 16, 50],
@@ -291,8 +283,8 @@ with gr.Blocks() as demo:
         fn = gen_video,
         inputs=[text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step],
         outputs=[output],
-
-        cache_examples="lazy",
+        cache_examples=True,
+        # cache_examples="lazy",
     )
 
     run.click(gen_video, [text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step], [output])
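Note on the last hunk: cache_examples=True runs every example through fn once when the app starts (for a Space, at build time) and then serves the stored outputs, while "lazy" defers each example to its first click. A toy illustration of the switch, unrelated to the demo's models:

import gradio as gr

def shout(text):
    # Stand-in for an expensive fn such as gen_video.
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Output")
    gr.Examples(
        examples=[["hello"], ["world"]],
        fn=shout,
        inputs=[inp],
        outputs=[out],
        cache_examples=True,    # precompute all example outputs up front
        # cache_examples="lazy" # ...or compute each one on first click
    )

demo.launch()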
requirements.txt
CHANGED
@@ -17,4 +17,5 @@ beautifulsoup4
 ftfy
 omegaconf
 spaces
-imageio-ffmpeg
+imageio-ffmpeg
+bitsandbytes
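bitsandbytes is added because BitsAndBytesConfig(load_in_4bit=True) in demo.py needs its CUDA kernels at model-load time. A quick sanity check after installation (assumed check, not part of the commit):

# Fails with ImportError if the 4-bit backend is missing from the env.
import bitsandbytes
import transformers
print("bitsandbytes", bitsandbytes.__version__)
print("transformers", transformers.__version__)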
sample_videos/t2v-temp.mp4
CHANGED
Binary files a/sample_videos/t2v-temp.mp4 and b/sample_videos/t2v-temp.mp4 differ