Update mimicmotion/pipelines/pipeline_mimicmotion.py
mimicmotion/pipelines/pipeline_mimicmotion.py
CHANGED
```diff
@@ -222,40 +222,33 @@ class MimicMotionPipeline(DiffusionPipeline):
             decode_chunk_size: int = 8):
         # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
         latents = latents.flatten(0, 1)

         latents = 1 / self.vae.config.scaling_factor * latents

         forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
         accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())

-        def process_chunk(start, end, frames_list):
-            decode_kwargs = {}
-            if accepts_num_frames:
-                decode_kwargs["num_frames"] = end - start
-            frame = self.vae.decode(latents[start:end], **decode_kwargs).sample
-            frames_list.append(frame.cpu())
-
-        threads = []
+        # decode decode_chunk_size frames at a time to avoid OOM
         frames = []
-
-        # split the work into chunks and create threads to process them
         for i in range(0, latents.shape[0], decode_chunk_size):
-            thread = threading.Thread(target=process_chunk,
-                                      args=(i, min(i + decode_chunk_size, latents.shape[0]), frames))
-            thread.start()
-            threads.append(thread)
-
-        for thread in threads:
-            thread.join()
-
-        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
+            num_frames_in = latents[i: i + decode_chunk_size].shape[0]
+            decode_kwargs = {}
+            if accepts_num_frames:
+                # we only pass num_frames_in if it's expected
+                decode_kwargs["num_frames"] = num_frames_in
+
+            frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample
+            frames.append(frame.cpu())
         frames = torch.cat(frames, dim=0)
+
+        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
         frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
         frames = frames.float()
         return frames

     def check_inputs(self, image, height, width):
         if (
             not isinstance(image, torch.Tensor)
```
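This hunk replaces the thread-based VAE decode with a plain sequential chunked decode. Sequential chunking still bounds peak GPU memory (at most `decode_chunk_size` frames are decoded at once, and each result is moved to the CPU immediately), and it also guarantees that chunks land in `frames` in order, whereas threads appending to a shared list do so in completion order. A minimal self-contained sketch of the same pattern, with `toy_decode` standing in for `self.vae.decode` (both helper names here are illustrative, not part of the pipeline):

```python
import torch

def toy_decode(x: torch.Tensor) -> torch.Tensor:
    # stand-in for an expensive VAE decode: latent -> image space, 8x upsampling
    return x.repeat_interleave(8, dim=-1).repeat_interleave(8, dim=-2)

def decode_in_chunks(latents: torch.Tensor, decode_chunk_size: int = 8) -> torch.Tensor:
    frames = []
    for i in range(0, latents.shape[0], decode_chunk_size):
        chunk = latents[i: i + decode_chunk_size]   # at most decode_chunk_size frames in flight
        frames.append(toy_decode(chunk).cpu())      # offload each result before the next chunk
    return torch.cat(frames, dim=0)                 # sequential order is preserved by construction

latents = torch.randn(30, 4, 8, 8)                  # 30 frames of 4x8x8 latents
frames = decode_in_chunks(latents, decode_chunk_size=8)
print(frames.shape)                                 # torch.Size([30, 4, 64, 64])
```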
```diff
@@ -563,17 +556,21 @@ class MimicMotionPipeline(DiffusionPipeline):
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                 # Concatenate image_latents over channels dimension
                 latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                 # predict the noise residual
                 noise_pred = torch.zeros_like(image_latents)
                 noise_pred_cnt = image_latents.new_zeros((num_frames,))
                 weight = (torch.arange(tile_size, device=device) + 0.5) * 2. / tile_size
                 weight = torch.minimum(weight, 2 - weight)
-                for idx in indices:
+
+                # parallelize the loop over `indices` using a ThreadPoolExecutor
+                def process_index(idx):
+                    result = torch.zeros_like(image_latents[:, idx])  # placeholder for thread-safe accumulation
+
                     # classification-free inference
                     pose_latents = self.pose_net(image_pose[idx].to(device))
                     _noise_pred = self.unet(
```
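As context for this hunk: `weight` is a symmetric triangular ramp over each temporal tile, so frames near a tile's center dominate while frames near its edges are down-weighted; `noise_pred_cnt` records the total weight each frame receives, and the later `noise_pred.div_(noise_pred_cnt[:, None, None, None])` renormalizes where tiles overlap. The values are easy to verify for a small tile:

```python
import torch

tile_size = 8
weight = (torch.arange(tile_size) + 0.5) * 2. / tile_size  # ascending ramp: 0.125 .. 1.875
weight = torch.minimum(weight, 2 - weight)                  # fold into a symmetric triangle
print(weight)
# tensor([0.1250, 0.3750, 0.6250, 0.8750, 0.8750, 0.6250, 0.3750, 0.1250])
```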
```diff
@@ -585,8 +582,8 @@ class MimicMotionPipeline(DiffusionPipeline):
                         image_only_indicator=image_only_indicator,
                         return_dict=False,
                     )[0]
-                    noise_pred[:1, idx] += _noise_pred * weight[:, None, None, None]
+                    result[:1] += _noise_pred * weight[:, None, None, None]

                     # normal inference
                     _noise_pred = self.unet(
                         latent_model_input[1:, idx],
```
```diff
@@ -597,26 +594,34 @@ class MimicMotionPipeline(DiffusionPipeline):
                         image_only_indicator=image_only_indicator,
                         return_dict=False,
                     )[0]
-                    noise_pred[1:, idx] += _noise_pred * weight[:, None, None, None]
-                    noise_pred_cnt[idx] += weight
-                    progress_bar.update()
+                    result[1:] += _noise_pred * weight[:, None, None, None]
+
+                    return result, idx
+
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    futures = [executor.submit(process_index, idx) for idx in indices]
+                    for future in concurrent.futures.as_completed(futures):
+                        _noise_pred, idx = future.result()
+                        noise_pred[:, idx] += _noise_pred
+                        noise_pred_cnt[idx] += weight
+                        progress_bar.update()

                 noise_pred.div_(noise_pred_cnt[:, None, None, None])

                 # perform guidance
                 if self.do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                 # compute the previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                 if callback_on_step_end is not None:
                     callback_kwargs = {}
                     for k in callback_on_step_end_tensor_inputs:
                         callback_kwargs[k] = locals()[k]
                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                     latents = callback_outputs.pop("latents", latents)

         self.pose_net.cpu()
```
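These hunks move the per-tile accumulation out of the worker threads: each `process_index` call returns its weighted tile prediction, and all writes to the shared `noise_pred` / `noise_pred_cnt` tensors happen on the main thread as futures complete, so no lock is needed. A self-contained sketch of that submit-then-accumulate pattern, with a toy `process_index` in place of the two UNet passes (tile layout and tensor shapes are illustrative):

```python
import concurrent.futures
import torch

num_frames, tile_size = 16, 4
indices = [list(range(s, s + tile_size)) for s in range(0, num_frames, tile_size)]
weight = (torch.arange(tile_size) + 0.5) * 2. / tile_size
weight = torch.minimum(weight, 2 - weight)

noise_pred = torch.zeros(2, num_frames, 4)   # [cfg-free/normal, frames, channels]
noise_pred_cnt = torch.zeros(num_frames)

def process_index(idx):
    # stand-in for the two UNet passes: return a weighted tile prediction
    result = torch.ones(2, len(idx), 4) * weight[:, None]
    return result, idx

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_index, idx) for idx in indices]
    for future in concurrent.futures.as_completed(futures):
        result, idx = future.result()
        # all mutation of the shared tensors happens here, on the main thread
        noise_pred[:, idx] += result
        noise_pred_cnt[idx] += weight

noise_pred.div_(noise_pred_cnt[:, None])
print(noise_pred.mean().item())  # 1.0: weighting then renormalizing cancels exactly
```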