Spaces:

BestWishYsh
/

MagicTime

Running on A10G

App Files Files Community

MagicTime / utils /util.py

BestWishYsh

Upload 85 files

8a8dad9 verified 10 months ago

raw

history blame contribute delete

32.3 kB

	import os
	import imageio
	import numpy as np
	from tqdm import tqdm
	from typing import Union
	from einops import rearrange
	from safetensors import safe_open
	from transformers import CLIPTextModel
	import torch
	import torchvision
	import torch.distributed as dist

	def zero_rank_print(s):
	    """Print ``s`` prefixed with ``"### "`` from the main process only.

	    The message is emitted when ``torch.distributed`` has not been initialized
	    (single-process run) or when the calling process has rank 0.

	    The previous condition and-ed ``not dist.is_initialized()`` with
	    ``dist.is_initialized()``, which can never hold, so nothing was printed.
	    """
	    # get_rank() is only evaluated once the process group is known to exist.
	    if (not dist.is_initialized()) or dist.get_rank() == 0:
	        print("### " + s)

	def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
	    """Tile a batch of videos into one grid image per frame and save them to ``path``.

	    Args:
	        videos: Tensor in ``(b, c, t, h, w)`` layout (per the rearrange pattern below).
	        path: Output file passed to ``imageio.mimsave``. Its parent directory is
	            created when the path has one.
	        rescale: When true, values are mapped from ``[-1, 1]`` to ``[0, 1]`` first.
	        n_rows: Forwarded to ``torchvision.utils.make_grid`` as ``nrow``.
	        fps: Forwarded to ``imageio.mimsave``.
	    """
	    videos = rearrange(videos, "b c t h w -> t b c h w")
	    outputs = []
	    for frame_batch in videos:
	        grid = torchvision.utils.make_grid(frame_batch, nrow=n_rows)
	        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
	        if rescale:
	            grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1
	        # detach()/cpu() so tensors tracked by autograd or living on an
	        # accelerator can be converted; both are no-ops for plain CPU tensors.
	        grid = (grid * 255).detach().cpu().numpy().astype(np.uint8)
	        outputs.append(grid)

	    # A bare file name has an empty dirname, and os.makedirs("") raises.
	    out_dir = os.path.dirname(path)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)
	    imageio.mimsave(path, outputs, fps=fps)

	# DDIM Inversion
	@torch.no_grad()
	def init_prompt(prompt, pipeline):
	    """Encode the empty prompt and ``prompt`` with the pipeline's text encoder.

	    Both are tokenized with ``padding="max_length"`` up to the tokenizer's
	    ``model_max_length``; only ``prompt`` is tokenized with ``truncation=True``.

	    Returns:
	        ``torch.cat`` of the empty-prompt embedding followed by the prompt embedding.
	    """
	    tokenizer = pipeline.tokenizer

	    def encode(tokens):
	        # First element of the text encoder output is used as the embedding.
	        return pipeline.text_encoder(tokens.input_ids.to(pipeline.device))[0]

	    empty_tokens = tokenizer(
	        [""],
	        padding="max_length",
	        max_length=tokenizer.model_max_length,
	        return_tensors="pt",
	    )
	    uncond_embeddings = encode(empty_tokens)

	    prompt_tokens = tokenizer(
	        [prompt],
	        padding="max_length",
	        max_length=tokenizer.model_max_length,
	        truncation=True,
	        return_tensors="pt",
	    )
	    text_embeddings = encode(prompt_tokens)

	    return torch.cat([uncond_embeddings, text_embeddings])

	def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int,
	              sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler):
	    """Advance ``sample`` by one inversion step (towards the larger timestep).

	    The incoming ``timestep`` is treated as the *next* timestep; the current one
	    lies ``num_train_timesteps // num_inference_steps`` below it, capped at 999.
	    When that current timestep is negative, ``final_alpha_cumprod`` is used in
	    place of an ``alphas_cumprod`` lookup.

	    Returns:
	        The sample at the next timestep.
	    """
	    stride = ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps
	    next_timestep = timestep
	    timestep = min(timestep - stride, 999)

	    if timestep >= 0:
	        alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep]
	    else:
	        alpha_prod_t = ddim_scheduler.final_alpha_cumprod
	    alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep]

	    # Original-sample estimate from the current sample and the model output.
	    predicted_original = (sample - (1 - alpha_prod_t) ** 0.5 * model_output) / alpha_prod_t ** 0.5
	    # Component pointing along the model output at the next timestep.
	    direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
	    return alpha_prod_t_next ** 0.5 * predicted_original + direction

	def get_noise_pred_single(latents, t, context, unet):
	    """Call ``unet`` once and return the ``"sample"`` entry of its output.

	    ``context`` is passed to the UNet as ``encoder_hidden_states``.
	    """
	    unet_output = unet(latents, t, encoder_hidden_states=context)
	    return unet_output["sample"]

	@torch.no_grad()
	def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt):
	    """Run ``num_inv_steps`` inversion steps and collect every intermediate latent.

	    Timesteps are read from the end of ``ddim_scheduler.timesteps`` backwards.
	    Noise is predicted with the prompt-conditioned embedding only.

	    Returns:
	        A list starting with the input ``latent``, followed by one latent per step.
	    """
	    # init_prompt returns [unconditional, conditional]; only the second half is used.
	    _, cond_embeddings = init_prompt(prompt, pipeline).chunk(2)

	    trajectory = [latent]
	    current = latent.clone().detach()
	    for step in tqdm(range(num_inv_steps)):
	        schedule = ddim_scheduler.timesteps
	        t = schedule[len(schedule) - step - 1]
	        noise_pred = get_noise_pred_single(current, t, cond_embeddings, pipeline.unet)
	        current = next_step(noise_pred, t, current, ddim_scheduler)
	        trajectory.append(current)
	    return trajectory

	@torch.no_grad()
	def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""):
	    """Invert ``video_latent`` with ``ddim_loop`` and return its list of latents.

	    ``prompt`` defaults to the empty string.
	    """
	    return ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt)

	def load_weights(
	    magictime_pipeline,
	    motion_module_path = "",
	    dreambooth_model_path = "",
	    magic_adapter_s_path = "",
	    magic_adapter_t_path = "",
	    magic_text_encoder_path = "",
	):
	    """Load the optional weight sets into ``magictime_pipeline``.

	    Every path defaults to ``""``, which skips the corresponding step.

	    Args:
	        magictime_pipeline: Pipeline exposing ``unet``, ``vae`` and ``text_encoder``.
	        motion_module_path: ``torch.load``-able checkpoint; entries whose name contains
	            ``"motion_modules."`` are loaded into the UNet (non-strict). Failures in this
	            step are printed and do not abort loading.
	        dreambooth_model_path: ``.safetensors`` or ``.ckpt`` base model; converted and
	            loaded into the VAE and UNet, and used to replace the text encoder.
	        magic_adapter_s_path: ``torch.load``-able LoRA state dict merged via
	            ``load_diffusers_lora`` with ``alpha=1.0``.
	        magic_adapter_t_path: Adapter directory loaded onto the UNet with ``Swift.from_pretrained``.
	        magic_text_encoder_path: Adapter directory loaded onto the text encoder with
	            ``Swift.from_pretrained``.

	    Returns:
	        The pipeline with the requested weights applied.

	    Raises:
	        ValueError: ``dreambooth_model_path`` ends with neither ``.safetensors`` nor ``.ckpt``
	            (previously this surfaced as an ``UnboundLocalError``).
	    """
	    # NOTE(security): torch.load unpickles its input; only pass checkpoint files from
	    # trusted sources to this function.

	    # motion module
	    unet_state_dict = {}
	    if motion_module_path != "":
	        print(f"load motion module from {motion_module_path}")
	        try:
	            motion_module_state_dict = torch.load(motion_module_path, map_location="cpu")
	            if "state_dict" in motion_module_state_dict:
	                motion_module_state_dict = motion_module_state_dict["state_dict"]
	            for name, param in motion_module_state_dict.items():
	                if "motion_modules." in name:
	                    # removeprefix is already a no-op when the prefix is absent.
	                    unet_state_dict[name.removeprefix("module.")] = param
	        except Exception as e:
	            # Deliberately best-effort: report and continue with whatever was collected.
	            print(f"Error loading motion module: {e}")
	    try:
	        _missing, unexpected = magictime_pipeline.unet.load_state_dict(unet_state_dict, strict=False)
	        assert len(unexpected) == 0, f"Unexpected keys in state_dict: {unexpected}"
	        del unet_state_dict
	    except Exception as e:
	        print(f"Error loading state dict into UNet: {e}")

	    # base model
	    if dreambooth_model_path != "":
	        print(f"load dreambooth model from {dreambooth_model_path}")
	        if dreambooth_model_path.endswith(".safetensors"):
	            dreambooth_state_dict = {}
	            with safe_open(dreambooth_model_path, framework="pt", device="cpu") as f:
	                for key in f.keys():
	                    dreambooth_state_dict[key] = f.get_tensor(key)
	        elif dreambooth_model_path.endswith(".ckpt"):
	            dreambooth_state_dict = torch.load(dreambooth_model_path, map_location="cpu")
	        else:
	            raise ValueError(
	                "Unsupported dreambooth checkpoint format (expected .safetensors or .ckpt): "
	                f"{dreambooth_model_path}"
	            )

	        # 1. vae
	        converted_vae_checkpoint = convert_ldm_vae_checkpoint(dreambooth_state_dict, magictime_pipeline.vae.config)
	        magictime_pipeline.vae.load_state_dict(converted_vae_checkpoint)
	        # 2. unet
	        converted_unet_checkpoint = convert_ldm_unet_checkpoint(dreambooth_state_dict, magictime_pipeline.unet.config)
	        magictime_pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)
	        # 3. text_model
	        magictime_pipeline.text_encoder = convert_ldm_clip_checkpoint(dreambooth_state_dict)
	        del dreambooth_state_dict

	    # MagicAdapter and MagicTextEncoder
	    if magic_adapter_s_path != "":
	        print(f"load domain lora from {magic_adapter_s_path}")
	        magic_adapter_s_state_dict = torch.load(magic_adapter_s_path, map_location="cpu")
	        magictime_pipeline = load_diffusers_lora(magictime_pipeline, magic_adapter_s_state_dict, alpha=1.0)

	    if magic_adapter_t_path != "" or magic_text_encoder_path != "":
	        # Imported lazily so `swift` is only required when one of these adapters is used.
	        from swift import Swift

	        if magic_adapter_t_path != "":
	            print("load lora from swift for Unet")
	            Swift.from_pretrained(magictime_pipeline.unet, magic_adapter_t_path)

	        if magic_text_encoder_path != "":
	            print("load lora from swift for text encoder")
	            Swift.from_pretrained(magictime_pipeline.text_encoder, magic_text_encoder_path)

	    return magictime_pipeline

	def load_diffusers_lora(pipeline, state_dict, alpha=1.0):
	    """Merge LoRA down/up weight pairs from ``state_dict`` into ``pipeline.unet`` in place.

	    For each key without ``"up."`` in it, the matching up key is obtained by swapping
	    ``.down.`` for ``.up.``. The target layer is found by stripping the LoRA-specific
	    parts of the key and walking the remaining dotted path from ``pipeline.unet``.
	    Both matrices are doubled before ``alpha * up @ down`` is added to the layer weight.

	    Returns:
	        The same ``pipeline`` object.
	    """
	    # Applied in this order to turn a LoRA key into the dotted path of the target weight.
	    key_rewrites = (
	        ("processor.", ""),
	        ("_lora", ""),
	        ("down.", ""),
	        ("up.", ""),
	        ("to_out.", "to_out.0."),
	    )

	    for down_key in state_dict:
	        # Each pair is processed once, driven by its "down" entry.
	        if "up." in down_key:
	            continue

	        up_key = down_key.replace(".down.", ".up.")

	        target_path = down_key
	        for old, new in key_rewrites:
	            target_path = target_path.replace(old, new)

	        # Drop the trailing parameter name and descend attribute by attribute.
	        target_layer = pipeline.unet
	        for attr_name in target_path.split(".")[:-1]:
	            target_layer = target_layer.__getattr__(attr_name)

	        weight_down = state_dict[down_key] * 2
	        weight_up = state_dict[up_key] * 2
	        delta = torch.mm(weight_up, weight_down)
	        target_layer.weight.data += alpha * delta.to(target_layer.weight.data.device)

	    return pipeline

	def load_diffusers_lora_unet(unet, state_dict, alpha=1.0):
	    """Merge LoRA down/up weight pairs from ``state_dict`` into ``unet`` in place.

	    For each key without ``"up."`` in it, the matching up key is obtained by swapping
	    ``.down.`` for ``.up.``. The target layer is found by stripping the LoRA-specific
	    parts of the key and walking the remaining dotted path from ``unet``. Both matrices
	    are doubled before ``alpha * up @ down`` is added to the layer weight.

	    Returns:
	        The same ``unet`` object.
	    """
	    # Applied in this order to turn a LoRA key into the dotted path of the target weight.
	    key_rewrites = (
	        ("processor.", ""),
	        ("_lora", ""),
	        ("down.", ""),
	        ("up.", ""),
	        ("to_out.", "to_out.0."),
	    )

	    for down_key in state_dict:
	        # Each pair is processed once, driven by its "down" entry.
	        if "up." in down_key:
	            continue

	        up_key = down_key.replace(".down.", ".up.")

	        target_path = down_key
	        for old, new in key_rewrites:
	            target_path = target_path.replace(old, new)

	        # Drop the trailing parameter name and descend attribute by attribute.
	        target_layer = unet
	        for attr_name in target_path.split(".")[:-1]:
	            target_layer = target_layer.__getattr__(attr_name)

	        weight_down = state_dict[down_key] * 2
	        weight_up = state_dict[up_key] * 2
	        delta = torch.mm(weight_up, weight_down)
	        target_layer.weight.data += alpha * delta.to(target_layer.weight.data.device)

	    return unet

	def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6):
	    """Merge LoRA up/down weight pairs from ``state_dict`` into the pipeline in place.

	    Keys look like ``"lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"``.
	    Keys containing ``"text"`` target ``pipeline.text_encoder``; all others target
	    ``pipeline.unet``. Keys containing ``".alpha"`` are skipped.

	    Args:
	        pipeline: Object exposing ``unet`` and ``text_encoder``.
	        state_dict: Mapping from LoRA key to tensor.
	        LORA_PREFIX_UNET: Prefix stripped from UNet keys.
	        LORA_PREFIX_TEXT_ENCODER: Prefix stripped from text-encoder keys.
	        alpha: Scale applied to each ``up @ down`` product before it is added.

	    Returns:
	        The same ``pipeline`` object.
	    """
	    # A set keeps the "already merged" check O(1); the former list made it linear per key.
	    visited = set()

	    # directly update weight in diffusers model
	    for key in state_dict:
	        # as we have set the alpha beforehand, so just skip
	        if ".alpha" in key or key in visited:
	            continue

	        if "text" in key:
	            layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
	            curr_layer = pipeline.text_encoder
	        else:
	            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
	            curr_layer = pipeline.unet

	        # find the target layer: attribute names may themselves contain "_", so when a
	        # lookup fails the next fragment is glued on and the lookup is retried.
	        temp_name = layer_infos.pop(0)
	        while True:
	            try:
	                curr_layer = curr_layer.__getattr__(temp_name)
	                if layer_infos:
	                    temp_name = layer_infos.pop(0)
	                else:
	                    break
	            except Exception:
	                # pop(0) raises IndexError once the fragments run out, which ends the
	                # search for a key that matches no layer.
	                if len(temp_name) > 0:
	                    temp_name += "_" + layer_infos.pop(0)
	                else:
	                    temp_name = layer_infos.pop(0)

	        # pair_keys is always ordered [up, down]
	        if "lora_down" in key:
	            pair_keys = [key.replace("lora_down", "lora_up"), key]
	        else:
	            pair_keys = [key, key.replace("lora_up", "lora_down")]

	        # update weight
	        if len(state_dict[pair_keys[0]].shape) == 4:
	            # 4-D weights: drop the two trailing size-1 dims, multiply, then restore them
	            weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
	            weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
	            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
	        else:
	            weight_up = state_dict[pair_keys[0]].to(torch.float32)
	            weight_down = state_dict[pair_keys[1]].to(torch.float32)
	            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)

	        # mark both halves so the partner key is not merged a second time
	        visited.update(pair_keys)

	    return pipeline

	def shave_segments(path, n_shave_prefix_segments=1):
	    """
	    Drop dot-separated segments from ``path``.

	    A non-negative count removes that many leading segments; a negative count removes
	    that many trailing segments.
	    """
	    segments = path.split(".")
	    if n_shave_prefix_segments >= 0:
	        kept = segments[n_shave_prefix_segments:]
	    else:
	        kept = segments[:n_shave_prefix_segments]
	    return ".".join(kept)

	def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
	    """
	    Map resnet parameter paths to the new naming scheme (local renaming).

	    Returns a list of ``{"old": ..., "new": ...}`` dicts, one per input path, with
	    ``n_shave_prefix_segments`` applied to the renamed path via ``shave_segments``.
	    """
	    # Substring rewrites, applied in this order.
	    renames = (
	        ("in_layers.0", "norm1"),
	        ("in_layers.2", "conv1"),
	        ("out_layers.0", "norm2"),
	        ("out_layers.3", "conv2"),
	        ("emb_layers.1", "time_emb_proj"),
	        ("skip_connection", "conv_shortcut"),
	    )

	    mapping = []
	    for old_item in old_list:
	        renamed = old_item
	        for old, new in renames:
	            renamed = renamed.replace(old, new)
	        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
	        mapping.append({"old": old_item, "new": renamed})
	    return mapping

	def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
	    """
	    Map VAE resnet parameter paths to the new naming scheme (local renaming).

	    Only ``nin_shortcut`` is renamed (to ``conv_shortcut``); the result is then passed
	    through ``shave_segments``. Returns one ``{"old": ..., "new": ...}`` dict per path.
	    """

	    def rename(item):
	        renamed = item.replace("nin_shortcut", "conv_shortcut")
	        return shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)

	    return [{"old": old_item, "new": rename(old_item)} for old_item in old_list]

	def renew_attention_paths(old_list, n_shave_prefix_segments=0):
	    """
	    Map attention parameter paths to the new naming scheme (local renaming).

	    No renaming is performed here: every path maps to itself, and
	    ``n_shave_prefix_segments`` is accepted but not used.
	    """
	    return [{"old": item, "new": item} for item in old_list]

	def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
	    """
	    Map VAE attention parameter paths to the new naming scheme (local renaming).

	    Returns a list of ``{"old": ..., "new": ...}`` dicts, one per input path, with
	    ``n_shave_prefix_segments`` applied to the renamed path via ``shave_segments``.
	    """
	    # Substring rewrites, applied in this order.
	    renames = (
	        ("norm.weight", "group_norm.weight"),
	        ("norm.bias", "group_norm.bias"),
	        ("q.weight", "query.weight"),
	        ("q.bias", "query.bias"),
	        ("k.weight", "key.weight"),
	        ("k.bias", "key.bias"),
	        ("v.weight", "value.weight"),
	        ("v.bias", "value.bias"),
	        ("proj_out.weight", "proj_attn.weight"),
	        ("proj_out.bias", "proj_attn.bias"),
	    )

	    mapping = []
	    for old_item in old_list:
	        renamed = old_item
	        for old, new in renames:
	            renamed = renamed.replace(old, new)
	        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
	        mapping.append({"old": old_item, "new": renamed})
	    return mapping

	def assign_to_checkpoint(
	    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
	):
	    """
	    Final conversion step: copy locally renamed weights from ``old_checkpoint`` into
	    ``checkpoint`` under their globally renamed keys.

	    When ``attention_paths_to_split`` is given, each listed fused tensor is split into
	    query/key/value and stored under the names in its path map; ``config["num_head_channels"]``
	    is read for that split. ``additional_replacements`` is a list of ``{"old", "new"}``
	    substring rewrites applied after the built-in ``middle_block`` renames.

	    ``checkpoint`` is modified in place; nothing is returned.
	    """
	    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

	    split_requested = attention_paths_to_split is not None

	    # Splits the attention layers into three variables.
	    if split_requested:
	        for fused_path, part_names in attention_paths_to_split.items():
	            fused = old_checkpoint[fused_path]
	            channels = fused.shape[0] // 3

	            target_shape = (-1, channels) if len(fused.shape) == 3 else (-1)

	            num_heads = fused.shape[0] // config["num_head_channels"] // 3

	            per_head = fused.reshape((num_heads, 3 * channels // num_heads) + fused.shape[1:])
	            query, key, value = per_head.split(channels // num_heads, dim=1)

	            for part, tensor in (("query", query), ("key", key), ("value", value)):
	                checkpoint[part_names[part]] = tensor.reshape(target_shape)

	    # Global renaming table, applied in this order.
	    global_renames = (
	        ("middle_block.0", "mid_block.resnets.0"),
	        ("middle_block.1", "mid_block.attentions.0"),
	        ("middle_block.2", "mid_block.resnets.1"),
	    )

	    for entry in paths:
	        new_path = entry["new"]

	        # These have already been assigned
	        if split_requested and new_path in attention_paths_to_split:
	            continue

	        for old, new in global_renames:
	            new_path = new_path.replace(old, new)

	        if additional_replacements is not None:
	            for replacement in additional_replacements:
	                new_path = new_path.replace(replacement["old"], replacement["new"])

	        source = old_checkpoint[entry["old"]]
	        # proj_attn.weight has to be converted from conv 1D to linear
	        checkpoint[new_path] = source[:, :, 0] if "proj_attn.weight" in new_path else source

	def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
	    """
	    Takes a state dict and a config, and returns a converted checkpoint.

	    Args:
	        checkpoint: Full state dict. The UNet entries that get converted are popped from it
	            (``model.diffusion_model.*`` keys, or their ``model_ema.*`` counterparts in EMA mode),
	            so the caller's dict is modified.
	        config: UNet config, indexed for ``class_embed_type`` and ``layers_per_block`` and passed
	            on to ``assign_to_checkpoint``.
	        path: Only used in the informational message printed in EMA mode.
	        extract_ema: Take the EMA weights when the checkpoint has more than 100 ``model_ema`` keys.

	    Returns:
	        A new dict keyed with ``time_embedding`` / ``down_blocks`` / ``mid_block`` / ``up_blocks``
	        style names.

	    Raises:
	        NotImplementedError: ``config["class_embed_type"]`` is not ``None``, ``"timestep"`` or
	            ``"projection"``.
	    """

	    # extract state_dict for UNet
	    unet_state_dict = {}
	    keys = list(checkpoint.keys())

	    unet_key = "model.diffusion_model."

	    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
	    has_ema_weights = sum(k.startswith("model_ema") for k in keys) > 100
	    if has_ema_weights and extract_ema:
	        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
	        print(
	            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
	            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
	        )
	        for key in keys:
	            if key.startswith("model.diffusion_model"):
	                # EMA keys are the regular key with its dots removed after the first segment
	                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
	                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
	    else:
	        if has_ema_weights:
	            print(
	                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
	                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
	            )

	        for key in keys:
	            if key.startswith(unet_key):
	                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)

	    new_checkpoint = {}

	    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
	    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
	    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
	    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]

	    class_embed_type = config["class_embed_type"]
	    if class_embed_type is None:
	        # No parameters to port
	        pass
	    elif class_embed_type == "timestep" or class_embed_type == "projection":
	        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
	        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
	        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
	        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
	    else:
	        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")

	    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
	    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
	    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
	    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
	    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
	    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]

	    # NOTE: the per-block key lists below use substring tests, so the list for layer_id 1 also
	    # contains the keys of blocks 10, 11, ... when those exist.

	    # Retrieves the keys for the input blocks only
	    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
	    input_blocks = {
	        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
	        for layer_id in range(num_input_blocks)
	    }

	    # Retrieves the keys for the middle blocks only
	    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
	    middle_blocks = {
	        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
	        for layer_id in range(num_middle_blocks)
	    }

	    # Retrieves the keys for the output blocks only
	    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
	    output_blocks = {
	        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
	        for layer_id in range(num_output_blocks)
	    }

	    for i in range(1, num_input_blocks):
	        block_id = (i - 1) // (config["layers_per_block"] + 1)
	        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)

	        resnets = [
	            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
	        ]
	        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]

	        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
	            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
	                f"input_blocks.{i}.0.op.weight"
	            )
	            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
	                f"input_blocks.{i}.0.op.bias"
	            )

	        paths = renew_resnet_paths(resnets)
	        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
	        assign_to_checkpoint(
	            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
	        )

	        if len(attentions):
	            paths = renew_attention_paths(attentions)
	            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
	            assign_to_checkpoint(
	                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
	            )

	    resnet_0 = middle_blocks[0]
	    attentions = middle_blocks[1]
	    resnet_1 = middle_blocks[2]

	    resnet_0_paths = renew_resnet_paths(resnet_0)
	    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)

	    resnet_1_paths = renew_resnet_paths(resnet_1)
	    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)

	    attentions_paths = renew_attention_paths(attentions)
	    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
	    assign_to_checkpoint(
	        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
	    )

	    for i in range(num_output_blocks):
	        block_id = i // (config["layers_per_block"] + 1)
	        layer_in_block_id = i % (config["layers_per_block"] + 1)
	        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]

	        # Group the remaining sub-paths by their leading layer index.
	        output_block_list = {}
	        for layer in output_block_layers:
	            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
	            output_block_list.setdefault(layer_id, []).append(layer_name)

	        if len(output_block_list) > 1:
	            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
	            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]

	            # (a second, unused renew_resnet_paths(resnets) call was removed here)
	            paths = renew_resnet_paths(resnets)

	            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
	            assign_to_checkpoint(
	                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
	            )

	            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
	            if ["conv.bias", "conv.weight"] in output_block_list.values():
	                # A sub-layer holding only conv.weight / conv.bias is treated as the upsampler.
	                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
	                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
	                    f"output_blocks.{i}.{index}.conv.weight"
	                ]
	                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
	                    f"output_blocks.{i}.{index}.conv.bias"
	                ]

	                # Clear attentions as they have been attributed above.
	                if len(attentions) == 2:
	                    attentions = []

	            if len(attentions):
	                paths = renew_attention_paths(attentions)
	                meta_path = {
	                    "old": f"output_blocks.{i}.1",
	                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
	                }
	                assign_to_checkpoint(
	                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
	                )
	        else:
	            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
	            for path in resnet_0_paths:
	                old_path = ".".join(["output_blocks", str(i), path["old"]])
	                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])

	                new_checkpoint[new_path] = unet_state_dict[old_path]

	    return new_checkpoint

	def convert_ldm_clip_checkpoint(checkpoint):
	    """Build a ``CLIPTextModel`` and load the text-encoder weights found in ``checkpoint``.

	    The model is instantiated from ``"openai/clip-vit-large-patch14"``. Entries whose key
	    starts with ``cond_stage_model.transformer`` are loaded with that prefix stripped.
	    The ``...embeddings.position_ids`` entry is always left out; a checkpoint that does
	    not contain it no longer raises ``ValueError``.

	    Returns:
	        The loaded text model. ``checkpoint`` is not modified.
	    """
	    from transformers import CLIPTextModel
	    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

	    prefix = "cond_stage_model.transformer."
	    position_ids_key = "cond_stage_model.transformer.text_model.embeddings.position_ids"

	    # Filtering (instead of list.remove) tolerates checkpoints without the position_ids key.
	    text_model_dict = {
	        key[len(prefix):]: tensor
	        for key, tensor in checkpoint.items()
	        if key.startswith("cond_stage_model.transformer") and key != position_ids_key
	    }
	    text_model.load_state_dict(text_model_dict)

	    return text_model

	def convert_ldm_clip_text_model(text_model, checkpoint):
	    """Load the text-encoder weights found in ``checkpoint`` into ``text_model``.

	    Entries whose key starts with ``cond_stage_model.transformer`` are loaded with that
	    prefix stripped. The ``...embeddings.position_ids`` entry is always left out; a
	    checkpoint that does not contain it no longer raises ``ValueError``.

	    Returns:
	        The same ``text_model`` after ``load_state_dict``. ``checkpoint`` is not modified.
	    """
	    prefix = "cond_stage_model.transformer."
	    position_ids_key = "cond_stage_model.transformer.text_model.embeddings.position_ids"

	    # Filtering (instead of list.remove) tolerates checkpoints without the position_ids key.
	    text_model_dict = {
	        key[len(prefix):]: tensor
	        for key, tensor in checkpoint.items()
	        if key.startswith("cond_stage_model.transformer") and key != position_ids_key
	    }
	    text_model.load_state_dict(text_model_dict)

	    return text_model

	def conv_attn_to_linear(checkpoint):
	    """Drop trailing kernel dimensions from attention weights, in place.

	    Entries whose last two key segments are ``query.weight``, ``key.weight`` or
	    ``value.weight`` are indexed with ``[:, :, 0, 0]`` when they have more than two
	    dimensions; other entries whose key contains ``proj_attn.weight`` are indexed with
	    ``[:, :, 0]`` under the same condition. Nothing is returned.
	    """
	    qkv_suffixes = ("query.weight", "key.weight", "value.weight")
	    for name in list(checkpoint.keys()):
	        suffix = ".".join(name.split(".")[-2:])
	        if suffix in qkv_suffixes:
	            if checkpoint[name].ndim > 2:
	                checkpoint[name] = checkpoint[name][:, :, 0, 0]
	        elif "proj_attn.weight" in name and checkpoint[name].ndim > 2:
	            checkpoint[name] = checkpoint[name][:, :, 0]

	def convert_ldm_vae_checkpoint(checkpoint, config):
	    """Convert the ``first_stage_model.*`` entries of ``checkpoint`` to the VAE key layout.

	    Entries are read from ``checkpoint`` without removing them. ``config`` is only
	    forwarded to ``assign_to_checkpoint``.

	    Returns:
	        A new dict keyed with ``encoder.down_blocks`` / ``decoder.up_blocks`` /
	        ``mid_block`` style names.
	    """
	    # extract state dict for VAE
	    vae_key = "first_stage_model."
	    vae_state_dict = {
	        key.replace(vae_key, ""): checkpoint.get(key)
	        for key in list(checkpoint.keys())
	        if key.startswith(vae_key)
	    }

	    new_checkpoint = {}

	    # Entries that keep their name, apart from norm_out -> conv_norm_out.
	    for side in ("encoder", "decoder"):
	        for new_name, old_name in (("conv_in", "conv_in"), ("conv_out", "conv_out"), ("conv_norm_out", "norm_out")):
	            for param in ("weight", "bias"):
	                new_checkpoint[f"{side}.{new_name}.{param}"] = vae_state_dict[f"{side}.{old_name}.{param}"]

	    for conv_name in ("quant_conv", "post_quant_conv"):
	        for param in ("weight", "bias"):
	            new_checkpoint[f"{conv_name}.{param}"] = vae_state_dict[f"{conv_name}.{param}"]

	    def convert_mid_block(side):
	        """Assign the two mid resnets and the mid attention of ``side`` ("encoder"/"decoder")."""
	        mid_resnets = [key for key in vae_state_dict if f"{side}.mid.block" in key]
	        num_mid_res_blocks = 2
	        for idx in range(1, num_mid_res_blocks + 1):
	            resnets = [key for key in mid_resnets if f"{side}.mid.block_{idx}" in key]

	            paths = renew_vae_resnet_paths(resnets)
	            meta_path = {"old": f"mid.block_{idx}", "new": f"mid_block.resnets.{idx - 1}"}
	            assign_to_checkpoint(
	                paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config
	            )

	        mid_attentions = [key for key in vae_state_dict if f"{side}.mid.attn" in key]
	        paths = renew_vae_attention_paths(mid_attentions)
	        meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
	        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
	        conv_attn_to_linear(new_checkpoint)

	    # Retrieves the keys for the encoder down blocks only
	    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
	    down_blocks = {}
	    for layer_id in range(num_down_blocks):
	        down_blocks[layer_id] = [key for key in vae_state_dict if f"down.{layer_id}" in key]

	    # Retrieves the keys for the decoder up blocks only
	    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
	    up_blocks = {}
	    for layer_id in range(num_up_blocks):
	        up_blocks[layer_id] = [key for key in vae_state_dict if f"up.{layer_id}" in key]

	    # Encoder down blocks
	    for i in range(num_down_blocks):
	        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]

	        # The downsampler entries are popped from the working dict as they are moved.
	        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
	            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
	                f"encoder.down.{i}.downsample.conv.weight"
	            )
	            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
	                f"encoder.down.{i}.downsample.conv.bias"
	            )

	        paths = renew_vae_resnet_paths(resnets)
	        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
	        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

	    convert_mid_block("encoder")

	    # Decoder up blocks: source block ids run in the opposite direction to the target ids.
	    for i in range(num_up_blocks):
	        block_id = num_up_blocks - 1 - i
	        resnets = [
	            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
	        ]

	        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
	            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
	                f"decoder.up.{block_id}.upsample.conv.weight"
	            ]
	            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
	                f"decoder.up.{block_id}.upsample.conv.bias"
	            ]

	        paths = renew_vae_resnet_paths(resnets)
	        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
	        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

	    convert_mid_block("decoder")

	    return new_checkpoint