{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"machine_shape": "hm"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"ae1fb4b51ee2457998f8066635edcc14": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_347c9d512b454f4781f0dbbfda6c3724",
"IPY_MODEL_a458afb6c9b94046bc552a50ada0d867",
"IPY_MODEL_f9062f94c8ee41febe6717fcbcb5053f"
],
"layout": "IPY_MODEL_a79cec942e034086927cfda8e9d8afca"
}
},
"347c9d512b454f4781f0dbbfda6c3724": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_f30a519a167249b285fa7cf44e5565bc",
"placeholder": "",
"style": "IPY_MODEL_dba40b83726b4fdaa5d5c8cb1c4d3c3b",
"value": "Loading checkpoint shards: 100%"
}
},
"a458afb6c9b94046bc552a50ada0d867": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_65348e69ef7440038f107f5e8efa3b7e",
"max": 22,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_ddde5ed5da554643aaea09e0bb0b797a",
"value": 22
}
},
"f9062f94c8ee41febe6717fcbcb5053f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c138ccb796ff4c1a81ccce900bdec068",
"placeholder": "",
"style": "IPY_MODEL_558ca035522c405fba9c7a3f9e8eb135",
"value": " 22/22 [00:12<00:00, 1.64it/s]"
}
},
"a79cec942e034086927cfda8e9d8afca": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f30a519a167249b285fa7cf44e5565bc": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"dba40b83726b4fdaa5d5c8cb1c4d3c3b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"65348e69ef7440038f107f5e8efa3b7e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ddde5ed5da554643aaea09e0bb0b797a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"c138ccb796ff4c1a81ccce900bdec068": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"558ca035522c405fba9c7a3f9e8eb135": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "code",
"source": [
"model_name_or_path = \"TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T\" #@param {type:\"string\"}\n",
"model_name = model_name_or_path.split(\"/\")[-1]\n",
"\n",
"save_mistral_dir = \"/content/tiny_mistral\" #@param {type:\"string\"}\n",
"\n",
"mixtral_num_experts = 8 #@param {type:\"integer\"}\n",
"save_mixtral_dir = \"/content/tiny_mixtral_x\" #@param {type:\"string\"}\n"
],
"metadata": {
"cellView": "form",
"id": "IS9mKmQQEHbC"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nO9OwwtND6bp"
},
"outputs": [],
"source": [
"!pip install transformers --upgrade\n",
"!pip install torch safetensors"
]
},
{
"cell_type": "code",
"source": [
"!git clone https://huggingface.co./{model_name_or_path}"
],
"metadata": {
"id": "-mUQ35RTEE_G"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import json\n",
"import torch\n",
"\n",
"# load config.json\n",
"with open(f\"{model_name}/config.json\") as f:\n",
" config = json.load(f)\n",
"\n",
"print(config)\n",
"\n",
"mistral_config = {\n",
" \"architectures\": [\n",
" \"MistralForCausalLM\"\n",
" ],\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 1,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"silu\",\n",
" \"hidden_size\": 4096,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 14336,\n",
" \"max_position_embeddings\": 32768,\n",
" \"model_type\": \"mistral\",\n",
" \"num_attention_heads\": 32,\n",
" \"num_hidden_layers\": 32,\n",
" \"num_key_value_heads\": 8,\n",
" \"rms_norm_eps\": 1e-05,\n",
" \"rope_theta\": 1000000.0,\n",
" \"sliding_window\": None,\n",
" \"tie_word_embeddings\": False,\n",
" # \"torch_dtype\": \"bfloat16\",\n",
" \"transformers_version\": \"4.36.0\",\n",
" \"use_cache\": True,\n",
" \"vocab_size\": 32000\n",
"}\n",
"mistral_config[\"architectures\"] = [\"MistralForCausalLM\"]\n",
"mistral_config[\"model_type\"] = \"mistral\"\n",
"mistral_config[\"bos_token_id\"] = config[\"bos_token_id\"]\n",
"mistral_config[\"eos_token_id\"] = config[\"eos_token_id\"]\n",
"mistral_config[\"hidden_act\"] = config[\"hidden_act\"]\n",
"mistral_config[\"hidden_size\"] = config[\"hidden_size\"]\n",
"mistral_config[\"initializer_range\"] = config[\"initializer_range\"]\n",
"mistral_config[\"intermediate_size\"] = config[\"intermediate_size\"]\n",
"mistral_config[\"max_position_embeddings\"] = config[\"max_position_embeddings\"]\n",
"mistral_config[\"num_attention_heads\"] = config[\"num_attention_heads\"]\n",
"mistral_config[\"num_hidden_layers\"] = config[\"num_hidden_layers\"]\n",
"mistral_config[\"num_key_value_heads\"] = config[\"num_key_value_heads\"]\n",
"mistral_config[\"rms_norm_eps\"] = config[\"rms_norm_eps\"]\n",
"mistral_config[\"rope_theta\"] = 1000000.0\n",
"mistral_config[\"sliding_window\"] = None\n",
"mistral_config[\"tie_word_embeddings\"] = config[\"tie_word_embeddings\"]\n",
"mistral_config[\"torch_dtype\"] = config[\"torch_dtype\"]\n",
"mistral_config[\"transformers_version\"] = \"4.36.0\"\n",
"mistral_config[\"use_cache\"] = config[\"use_cache\"]\n",
"mistral_config[\"vocab_size\"] = config[\"vocab_size\"]\n",
"\n",
"# save tokenizer and model\n",
"from transformers import AutoTokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
"tokenizer.save_pretrained(save_mistral_dir)\n",
"\n",
"from transformers import AutoModelForCausalLM\n",
"model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n",
"if model.dtype == torch.float32:\n",
" model.half()\n",
" model.to(torch.bfloat16)\n",
" mistral_config[\"torch_dtype\"] = \"bfloat16\"\n",
"\n",
"model.save_pretrained(save_mistral_dir)\n",
"\n",
"# save convert mistral config\n",
"with open(f\"{save_mistral_dir}/config.json\", \"w\") as f:\n",
" json.dump(mistral_config, f, indent=2)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ITP1ylgIEaUm",
"outputId": "7d5175f6-a686-47b0-ce5a-1c89464c05e5"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'_name_or_path': 'meta-llama/Llama-2-7b-hf', 'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 5632, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 32, 'num_hidden_layers': 22, 'num_key_value_heads': 4, 'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float32', 'transformers_version': '4.31.0.dev0', 'use_cache': True, 'vocab_size': 32000}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#\n",
"# モデルの出力テスト\n",
"#\n",
"from transformers import AutoModelForCausalLM, MistralForCausalLM, MixtralForCausalLM\n",
"def test_gen(model_name_or_path):\n",
"\n",
" device = \"cpu\" # ここを変えてね\n",
"\n",
" model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n",
"\n",
" print(\"check model load \")\n",
" print(model.config)\n",
" print(model)\n",
"\n",
" print(\"check model generate text\")\n",
" messages = [\n",
" {\"role\": \"user\", \"content\": \"What is your favourite condiment?\"},\n",
" {\"role\": \"assistant\", \"content\": \"Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!\"},\n",
" {\"role\": \"user\", \"content\": \"Do you have mayonnaise recipes?\"}\n",
" ]\n",
"\n",
" encodeds = tokenizer.apply_chat_template(messages, return_tensors=\"pt\")\n",
"\n",
" model_inputs = encodeds.to(device)\n",
" model.to(device)\n",
"\n",
" generated_ids = model.generate(model_inputs, max_new_tokens=128, do_sample=True)\n",
" decoded = tokenizer.batch_decode(generated_ids)\n",
" print(decoded[0])\n",
" print(\"------------------------\")\n",
" return model, tokenizer\n",
"\n",
"_ , _ = test_gen(save_mistral_dir)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zx_NM0wEHjmU",
"outputId": "13b64987-2079-44cd-c707-13a4fe77d474"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"check model load \n",
"MistralConfig {\n",
" \"_name_or_path\": \"/content/tiny_mistral\",\n",
" \"architectures\": [\n",
" \"MistralForCausalLM\"\n",
" ],\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 1,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"silu\",\n",
" \"hidden_size\": 2048,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 5632,\n",
" \"max_position_embeddings\": 2048,\n",
" \"model_type\": \"mistral\",\n",
" \"num_attention_heads\": 32,\n",
" \"num_hidden_layers\": 22,\n",
" \"num_key_value_heads\": 4,\n",
" \"rms_norm_eps\": 1e-05,\n",
" \"rope_theta\": 1000000.0,\n",
" \"sliding_window\": null,\n",
" \"tie_word_embeddings\": false,\n",
" \"torch_dtype\": \"bfloat16\",\n",
" \"transformers_version\": \"4.36.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 32000\n",
"}\n",
"\n",
"MistralForCausalLM(\n",
" (model): MistralModel(\n",
" (embed_tokens): Embedding(32000, 2048)\n",
" (layers): ModuleList(\n",
" (0-21): 22 x MistralDecoderLayer(\n",
" (self_attn): MistralAttention(\n",
" (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
" (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
" (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
" (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
" (rotary_emb): MistralRotaryEmbedding()\n",
" )\n",
" (mlp): MistralMLP(\n",
" (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
" (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
" (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n",
" (act_fn): SiLU()\n",
" )\n",
" (input_layernorm): MistralRMSNorm()\n",
" (post_attention_layernorm): MistralRMSNorm()\n",
" )\n",
" )\n",
" (norm): MistralRMSNorm()\n",
" )\n",
" (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n",
")\n",
"check model generate text\n",
" [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] ᴍ [/INST] We are about to test a Mayonnaise recipe. ꧁[INST] It's really good. ᴍ꧁ [INST] Do you know how to make one? [/INST] I've eaten many on my recent days. But, I didn't know any recipe.\n",
"[INST] Not here. But, I have tested and I am going to try this recipes sometime. I am so excited!\n",
"ᴍ That is very useful for me. I'd have love to try.���\n",
"------------------------\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#\n",
"# mixtral config setting\n",
"#\n",
"\n",
"mixtral_config = {\n",
" \"architectures\": [\n",
" \"MixtralForCausalLM\"\n",
" ],\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 1,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"silu\",\n",
" \"hidden_size\": 4096,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 14336,\n",
" \"max_position_embeddings\": 32768,\n",
" \"model_type\": \"mixtral\",\n",
" \"num_attention_heads\": 32,\n",
" \"num_experts_per_tok\": 2,\n",
" \"num_hidden_layers\": 32,\n",
" \"num_key_value_heads\": 8,\n",
" \"num_local_experts\": 8,\n",
" \"output_router_logits\": False,\n",
" \"rms_norm_eps\": 1e-05,\n",
" \"rope_theta\": 1000000.0,\n",
" \"router_aux_loss_coef\": 0.02,\n",
" \"sliding_window\": None,\n",
" \"tie_word_embeddings\": False,\n",
" \"torch_dtype\": \"bfloat16\",\n",
" \"transformers_version\": \"4.36.0.dev0\",\n",
" \"use_cache\": True,\n",
" \"vocab_size\": 32000\n",
"}\n",
"\n",
"mixtral_config[\"architectures\"] = [\"MixtralForCausalLM\"]\n",
"mixtral_config[\"model_type\"] = \"mixtral\"\n",
"mixtral_config[\"num_experts_per_tok\"] = 2\n",
"mixtral_config[\"num_local_experts\"] = mixtral_num_experts\n",
"\n",
"mixtral_config[\"bos_token_id\"] = mistral_config[\"bos_token_id\"]\n",
"mixtral_config[\"eos_token_id\"] = mistral_config[\"eos_token_id\"]\n",
"mixtral_config[\"hidden_act\"] = mistral_config[\"hidden_act\"]\n",
"mixtral_config[\"hidden_size\"] = mistral_config[\"hidden_size\"]\n",
"mixtral_config[\"initializer_range\"] = mistral_config[\"initializer_range\"]\n",
"mixtral_config[\"intermediate_size\"] = mistral_config[\"intermediate_size\"]\n",
"mixtral_config[\"max_position_embeddings\"] = mistral_config[\"max_position_embeddings\"]\n",
"mixtral_config[\"num_attention_heads\"] = mistral_config[\"num_attention_heads\"]\n",
"mixtral_config[\"num_hidden_layers\"] = mistral_config[\"num_hidden_layers\"]\n",
"mixtral_config[\"num_key_value_heads\"] = mistral_config[\"num_key_value_heads\"]\n",
"mixtral_config[\"rms_norm_eps\"] = mistral_config[\"rms_norm_eps\"]\n",
"mixtral_config[\"rope_theta\"] = mistral_config[\"rope_theta\"]\n",
"mixtral_config[\"sliding_window\"] = mistral_config[\"sliding_window\"]\n",
"mixtral_config[\"tie_word_embeddings\"] = mistral_config[\"tie_word_embeddings\"]\n",
"mixtral_config[\"torch_dtype\"] = mistral_config[\"torch_dtype\"]\n",
"mixtral_config[\"transformers_version\"] = \"4.36.0.dev0\"\n",
"mixtral_config[\"use_cache\"] = mistral_config[\"use_cache\"]\n",
"mixtral_config[\"vocab_size\"] = mistral_config[\"vocab_size\"]\n",
"\n",
"print(json.dumps(mixtral_config,indent=2))\n",
"\n",
"# configをsave\n",
"!mkdir -p {save_mixtral_dir}\n",
"with open(f\"{save_mixtral_dir}/config.json\", \"w\") as f:\n",
" json.dump(mixtral_config, f, indent=2)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zipdtc3AIWYD",
"outputId": "222d5380-7228-4412-8684-cf6d1c851e74"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{\n",
" \"architectures\": [\n",
" \"MixtralForCausalLM\"\n",
" ],\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 1,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"silu\",\n",
" \"hidden_size\": 2048,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 5632,\n",
" \"max_position_embeddings\": 2048,\n",
" \"model_type\": \"mixtral\",\n",
" \"num_attention_heads\": 32,\n",
" \"num_experts_per_tok\": 2,\n",
" \"num_hidden_layers\": 22,\n",
" \"num_key_value_heads\": 4,\n",
" \"num_local_experts\": 8,\n",
" \"output_router_logits\": false,\n",
" \"rms_norm_eps\": 1e-05,\n",
" \"rope_theta\": 1000000.0,\n",
" \"router_aux_loss_coef\": 0.02,\n",
" \"sliding_window\": null,\n",
" \"tie_word_embeddings\": false,\n",
" \"torch_dtype\": \"bfloat16\",\n",
" \"transformers_version\": \"4.36.0.dev0\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 32000\n",
"}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# copy other model files\n",
"\n",
"# save tokenizer\n",
"if tokenizer is None:\n",
" from transformers import AutoTokenizer\n",
" tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
"\n",
"tokenizer.save_pretrained(save_mixtral_dir)\n",
"\n",
"!cp {save_mistral_dir}/generation_config.json {save_mixtral_dir}/generation_config.json\n"
],
"metadata": {
"id": "T2uTzZHyk6vS"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# convert mixtral clone\n",
"import torch\n",
"from safetensors import safe_open\n",
"from safetensors.torch import save_file\n",
"import re\n",
"\n",
"def convert_weight_name(mistral_key, mixtral_expert_num):\n",
" if \"mlp.gate_proj\" in mistral_key:\n",
" return mistral_key.replace(\".mlp.gate_proj.\", f\".block_sparse_moe.experts.{mixtral_expert_num}.w1.\")\n",
" elif \"mlp.down_proj\" in mistral_key:\n",
" return mistral_key.replace(\".mlp.down_proj.\", f\".block_sparse_moe.experts.{mixtral_expert_num}.w2.\")\n",
" elif \"mlp.up_proj\" in mistral_key:\n",
" return mistral_key.replace(\".mlp.up_proj.\" , f\".block_sparse_moe.experts.{mixtral_expert_num}.w3.\")\n",
" else:\n",
" return mistral_key\n",
"\n",
"def is_experts_key(mistral_key):\n",
" return \".mlp.\" in mistral_key\n",
"\n",
"def get_layer(mistral_key):\n",
" layer = re.match(r'model[.]layers[.]\\d+[.]', mistral_key)\n",
" if layer is not None:\n",
" return int(re.findall(r'\\d+', layer[0])[0])\n",
" return None\n",
"\n",
"def get_weight_byte_size(weight):\n",
"\n",
" if isinstance(weight, torch.Tensor):\n",
" weight_byte_size = weight.nelement() * weight.element_size()\n",
" else:\n",
" weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n",
"\n",
" return weight_byte_size\n",
"\n",
"# mistralのweight取得\n",
"mistral_weights = safe_open(save_mistral_dir + \"/model.safetensors\", framework=\"pt\")\n",
"# print(mistral_weights.keys())\n",
"\n",
"first_weights = {}\n",
"\n",
"gate_shape = mistral_weights.get_tensor(\"model.layers.0.mlp.up_proj.weight\").shape\n",
"gate_tensor = torch.full((mixtral_num_experts, gate_shape[1]), 0.5)\n",
"\n",
"common_layer_weights = {}\n",
"\n",
"print(\"mixtral_num_experts\", mixtral_num_experts, \"gate_shape[1]\", gate_shape[1], \"gate_tensor\", gate_tensor)\n",
"\n",
"# max layer\n",
"max_layer_no = 0\n",
"for key in mistral_weights.keys():\n",
" layer_no = get_layer(key)\n",
" if layer_no is None:\n",
" first_weights[key] = mistral_weights.get_tensor(key)\n",
" else:\n",
" max_layer_no = max(max_layer_no, layer_no)\n",
"\n",
"mixtral_weight_map = {\n",
" \"metadata\": {\n",
" \"total_size\": 0\n",
" },\n",
" \"weight_map\": {\n",
" }\n",
"}\n",
"\n",
"total_size = 0\n",
"\n",
"!rm {save_mixtral_dir + \"/*.safetensors\"}\n",
"\n",
"for i in range(max_layer_no + 1):\n",
" weight_file_no = i + 1\n",
" layer_weights = {}\n",
"\n",
" # first weight\n",
" if weight_file_no == 1:\n",
" for key in first_weights.keys():\n",
" mixtral_key = convert_weight_name(key, 0)\n",
" layer_weights[mixtral_key] = first_weights[mixtral_key]\n",
" total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
" print(\"first\", mixtral_key, layer_weights[mixtral_key].shape)\n",
"\n",
"\n",
" for key in mistral_weights.keys():\n",
"\n",
" lk = re.match(re.compile(f\"model[.]layers[.]{i}[.]\"), key)\n",
" if lk is not None:\n",
" mistral_layer_key = key\n",
" if not is_experts_key(mistral_layer_key):\n",
" mixtral_key = convert_weight_name(mistral_layer_key, 0)\n",
" layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key)\n",
" total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
" print(\"layer\", i , mixtral_key, layer_weights[mixtral_key].shape)\n",
" else:\n",
" print(\"gen experts\")\n",
" for expert_no in range(mixtral_num_experts):\n",
" mixtral_key = convert_weight_name(mistral_layer_key, expert_no)\n",
" layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key).clone()\n",
" total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
" print(\"layer\", i , \"expert\", expert_no, mixtral_key, layer_weights[mixtral_key].shape)\n",
"\n",
" # gate\n",
" mixtral_key = f\"model.layers.{i}.block_sparse_moe.gate.weight\"\n",
" layer_weights[mixtral_key] = gate_tensor.clone()\n",
" total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
" print(\"layer\", i , \"gate\", mixtral_key, layer_weights[mixtral_key].shape)\n",
"\n",
" #フォーマットで0埋め\n",
" tensor_weight_file_name = f\"model.layers.{weight_file_no:05d}-of-{max_layer_no + 1:05d}.safetensors\"\n",
"\n",
" # save safetensor\n",
" save_file(layer_weights, save_mixtral_dir + \"/\" + tensor_weight_file_name, metadata={\"format\":\"pt\"})\n",
" print(\"Save layer weighs\", i, tensor_weight_file_name)\n",
"\n",
" for key in layer_weights.keys():\n",
" mixtral_weight_map[\"weight_map\"][key] = tensor_weight_file_name\n",
"\n",
" print(i, tensor_weight_file_name)\n",
"\n",
"# set total size\n",
"mixtral_weight_map[\"metadata\"][\"total_size\"] = total_size\n",
"\n",
"# save model.safetensors.index.json\n",
"with open(save_mixtral_dir + \"/model.safetensors.index.json\", \"w\") as f:\n",
" json.dump(mixtral_weight_map, f, indent=2)\n",
"\n",
"print(mixtral_weight_map)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Lswg1ESdI7q9",
"outputId": "170e9c80-7856-4a64-b9d6-2701c650ed68"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"mixtral_num_experts 8 gate_shape[1] 2048 gate_tensor tensor([[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
" [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
" [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
" ...,\n",
" [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
" [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
" [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]])\n",
"first lm_head.weight torch.Size([32000, 2048])\n",
"first model.embed_tokens.weight torch.Size([32000, 2048])\n",
"first model.norm.weight torch.Size([2048])\n",
"layer 0 model.layers.0.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 0 model.layers.0.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 0 model.layers.0.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 0 model.layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 0 model.layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 0 model.layers.0.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 0 gate model.layers.0.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 0 model.layers.00001-of-00022.safetensors\n",
"0 model.layers.00001-of-00022.safetensors\n",
"layer 1 model.layers.1.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 1 model.layers.1.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 1 model.layers.1.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 1 model.layers.1.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 1 model.layers.1.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 1 model.layers.1.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 1 gate model.layers.1.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 1 model.layers.00002-of-00022.safetensors\n",
"1 model.layers.00002-of-00022.safetensors\n",
"layer 2 model.layers.2.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 2 model.layers.2.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 2 model.layers.2.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 2 model.layers.2.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 2 model.layers.2.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 2 model.layers.2.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 2 gate model.layers.2.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 2 model.layers.00003-of-00022.safetensors\n",
"2 model.layers.00003-of-00022.safetensors\n",
"layer 3 model.layers.3.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 3 model.layers.3.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 3 model.layers.3.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 3 model.layers.3.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 3 model.layers.3.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 3 model.layers.3.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 3 gate model.layers.3.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 3 model.layers.00004-of-00022.safetensors\n",
"3 model.layers.00004-of-00022.safetensors\n",
"layer 4 model.layers.4.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 4 model.layers.4.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 4 model.layers.4.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 4 model.layers.4.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 4 model.layers.4.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 4 model.layers.4.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 4 gate model.layers.4.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 4 model.layers.00005-of-00022.safetensors\n",
"4 model.layers.00005-of-00022.safetensors\n",
"layer 5 model.layers.5.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 5 model.layers.5.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 5 model.layers.5.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 5 model.layers.5.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 5 model.layers.5.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 5 model.layers.5.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 5 gate model.layers.5.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 5 model.layers.00006-of-00022.safetensors\n",
"5 model.layers.00006-of-00022.safetensors\n",
"layer 6 model.layers.6.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 6 model.layers.6.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 6 model.layers.6.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 6 model.layers.6.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 6 model.layers.6.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 6 model.layers.6.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 6 gate model.layers.6.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 6 model.layers.00007-of-00022.safetensors\n",
"6 model.layers.00007-of-00022.safetensors\n",
"layer 7 model.layers.7.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 7 model.layers.7.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 7 model.layers.7.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 7 model.layers.7.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 7 model.layers.7.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 7 model.layers.7.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 7 gate model.layers.7.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 7 model.layers.00008-of-00022.safetensors\n",
"7 model.layers.00008-of-00022.safetensors\n",
"layer 8 model.layers.8.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 8 model.layers.8.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 8 model.layers.8.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 8 model.layers.8.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 8 model.layers.8.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 8 model.layers.8.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 8 gate model.layers.8.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 8 model.layers.00009-of-00022.safetensors\n",
"8 model.layers.00009-of-00022.safetensors\n",
"layer 9 model.layers.9.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 9 model.layers.9.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 9 model.layers.9.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 9 model.layers.9.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 9 model.layers.9.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 9 model.layers.9.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 9 gate model.layers.9.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 9 model.layers.00010-of-00022.safetensors\n",
"9 model.layers.00010-of-00022.safetensors\n",
"layer 10 model.layers.10.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 10 model.layers.10.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 10 model.layers.10.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 10 model.layers.10.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 10 model.layers.10.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 10 model.layers.10.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 10 gate model.layers.10.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 10 model.layers.00011-of-00022.safetensors\n",
"10 model.layers.00011-of-00022.safetensors\n",
"layer 11 model.layers.11.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 11 model.layers.11.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 11 model.layers.11.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 11 model.layers.11.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 11 model.layers.11.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 11 model.layers.11.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 11 gate model.layers.11.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 11 model.layers.00012-of-00022.safetensors\n",
"11 model.layers.00012-of-00022.safetensors\n",
"layer 12 model.layers.12.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 12 model.layers.12.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 12 model.layers.12.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 12 model.layers.12.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 12 model.layers.12.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 12 model.layers.12.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 12 gate model.layers.12.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 12 model.layers.00013-of-00022.safetensors\n",
"12 model.layers.00013-of-00022.safetensors\n",
"layer 13 model.layers.13.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 13 model.layers.13.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 13 model.layers.13.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 13 model.layers.13.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 13 model.layers.13.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 13 model.layers.13.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 13 gate model.layers.13.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 13 model.layers.00014-of-00022.safetensors\n",
"13 model.layers.00014-of-00022.safetensors\n",
"layer 14 model.layers.14.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 14 model.layers.14.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 14 model.layers.14.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 14 model.layers.14.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 14 model.layers.14.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 14 model.layers.14.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 14 gate model.layers.14.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 14 model.layers.00015-of-00022.safetensors\n",
"14 model.layers.00015-of-00022.safetensors\n",
"layer 15 model.layers.15.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 15 model.layers.15.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 15 model.layers.15.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 15 model.layers.15.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 15 model.layers.15.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 15 model.layers.15.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 15 gate model.layers.15.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 15 model.layers.00016-of-00022.safetensors\n",
"15 model.layers.00016-of-00022.safetensors\n",
"layer 16 model.layers.16.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 16 model.layers.16.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 16 model.layers.16.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 16 model.layers.16.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 16 model.layers.16.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 16 model.layers.16.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 16 gate model.layers.16.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 16 model.layers.00017-of-00022.safetensors\n",
"16 model.layers.00017-of-00022.safetensors\n",
"layer 17 model.layers.17.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 17 model.layers.17.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 17 model.layers.17.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 17 model.layers.17.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 17 model.layers.17.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 17 model.layers.17.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 17 gate model.layers.17.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 17 model.layers.00018-of-00022.safetensors\n",
"17 model.layers.00018-of-00022.safetensors\n",
"layer 18 model.layers.18.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 18 model.layers.18.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 18 model.layers.18.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 18 model.layers.18.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 18 model.layers.18.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 18 model.layers.18.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 18 gate model.layers.18.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 18 model.layers.00019-of-00022.safetensors\n",
"18 model.layers.00019-of-00022.safetensors\n",
"layer 19 model.layers.19.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 19 model.layers.19.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 19 model.layers.19.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 19 model.layers.19.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 19 model.layers.19.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 19 model.layers.19.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 19 gate model.layers.19.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 19 model.layers.00020-of-00022.safetensors\n",
"19 model.layers.00020-of-00022.safetensors\n",
"layer 20 model.layers.20.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 20 model.layers.20.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 20 model.layers.20.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 20 model.layers.20.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 20 model.layers.20.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 20 model.layers.20.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 20 gate model.layers.20.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 20 model.layers.00021-of-00022.safetensors\n",
"20 model.layers.00021-of-00022.safetensors\n",
"layer 21 model.layers.21.input_layernorm.weight torch.Size([2048])\n",
"gen experts\n",
"layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
"layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
"layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
"layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
"layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
"layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
"layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
"layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
"gen experts\n",
"layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
"layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
"layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
"layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
"layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
"layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
"layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
"layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
"gen experts\n",
"layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
"layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
"layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
"layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
"layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
"layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
"layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
"layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
"layer 21 model.layers.21.post_attention_layernorm.weight torch.Size([2048])\n",
"layer 21 model.layers.21.self_attn.k_proj.weight torch.Size([256, 2048])\n",
"layer 21 model.layers.21.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
"layer 21 model.layers.21.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
"layer 21 model.layers.21.self_attn.v_proj.weight torch.Size([256, 2048])\n",
"layer 21 gate model.layers.21.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
"Save layer weighs 21 model.layers.00022-of-00022.safetensors\n",
"21 model.layers.00022-of-00022.safetensors\n",
"{'metadata': {'total_size': 12859265024}, 'weight_map': {'lm_head.weight': 'model.layers.00001-of-00022.safetensors', 'model.embed_tokens.weight': 'model.layers.00001-of-00022.safetensors', 'model.norm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.input_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.post_attention_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.k_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.o_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.q_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.v_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.gate.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.1.input_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.post_attention_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.k_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.o_proj.weight': 'model.layers.00002-of-00022.safetensors', 
'model.layers.1.self_attn.q_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.v_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.gate.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.2.input_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.post_attention_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.k_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.o_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.q_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.v_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.gate.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.3.input_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.post_attention_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.k_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.o_proj.weight': 'model.layers.00004-of-00022.safetensors', 
'model.layers.3.self_attn.q_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.v_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.gate.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.4.input_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.post_attention_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.k_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.o_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.q_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.v_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.gate.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.5.input_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.post_attention_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.k_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.o_proj.weight': 'model.layers.00006-of-00022.safetensors', 
'model.layers.5.self_attn.q_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.v_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.gate.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.6.input_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.post_attention_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.k_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.o_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.q_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.v_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.gate.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.7.input_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.post_attention_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.k_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.o_proj.weight': 'model.layers.00008-of-00022.safetensors', 
'model.layers.7.self_attn.q_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.v_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.gate.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.8.input_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.post_attention_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.k_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.o_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.q_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.v_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.gate.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.9.input_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.post_attention_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.k_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.o_proj.weight': 'model.layers.00010-of-00022.safetensors', 
'model.layers.9.self_attn.q_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.v_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.gate.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.10.input_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.post_attention_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.k_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.o_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.q_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.v_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.gate.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.11.input_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.post_attention_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.k_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.o_proj.weight': 'model.layers.00012-of-00022.safetensors', 
'model.layers.11.self_attn.q_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.v_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.gate.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.12.input_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.post_attention_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.k_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.o_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.q_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.v_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.gate.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.13.input_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.post_attention_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.k_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.o_proj.weight': 'model.layers.00014-of-00022.safetensors', 
'model.layers.13.self_attn.q_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.v_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.gate.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.14.input_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.post_attention_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.k_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.o_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.q_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.v_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.gate.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.15.input_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.post_attention_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.k_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.o_proj.weight': 'model.layers.00016-of-00022.safetensors', 
'model.layers.15.self_attn.q_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.v_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.gate.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.16.input_layernorm.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.post_attention_layernorm.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.k_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.o_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.q_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.v_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.gate.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.17.input_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.post_attention_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.k_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.o_proj.weight': 'model.layers.00018-of-00022.safetensors', 
'model.layers.17.self_attn.q_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.v_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.gate.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.18.input_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.post_attention_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.k_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.o_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.q_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.v_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.gate.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.19.input_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.post_attention_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.k_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.o_proj.weight': 'model.layers.00020-of-00022.safetensors', 
'model.layers.19.self_attn.q_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.v_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.gate.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.20.input_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.post_attention_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.k_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.o_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.q_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.v_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.gate.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.21.input_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.post_attention_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.k_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.o_proj.weight': 'model.layers.00022-of-00022.safetensors', 
'model.layers.21.self_attn.q_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.v_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.gate.weight': 'model.layers.00022-of-00022.safetensors'}}\n"
]
}
]
},
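{
"cell_type": "code",
"source": [
"# Sanity check: run test_gen on save_mixtral_dir (prints the config, the module tree and a sample generation) and keep the returned model and tokenizer\n",
"mx_model, mx_tok = test_gen(save_mixtral_dir)"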
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"ae1fb4b51ee2457998f8066635edcc14",
"347c9d512b454f4781f0dbbfda6c3724",
"a458afb6c9b94046bc552a50ada0d867",
"f9062f94c8ee41febe6717fcbcb5053f",
"a79cec942e034086927cfda8e9d8afca",
"f30a519a167249b285fa7cf44e5565bc",
"dba40b83726b4fdaa5d5c8cb1c4d3c3b",
"65348e69ef7440038f107f5e8efa3b7e",
"ddde5ed5da554643aaea09e0bb0b797a",
"c138ccb796ff4c1a81ccce900bdec068",
"558ca035522c405fba9c7a3f9e8eb135"
]
},
"id": "DCs_uVxCvCwR",
"outputId": "73663da6-0141-48fa-936b-82598865d27a"
},
"execution_count": 9,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Loading checkpoint shards: 0%| | 0/22 [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "ae1fb4b51ee2457998f8066635edcc14"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"check model load \n",
"MixtralConfig {\n",
" \"_name_or_path\": \"/content/tiny_mixtral_x\",\n",
" \"architectures\": [\n",
" \"MixtralForCausalLM\"\n",
" ],\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 1,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"silu\",\n",
" \"hidden_size\": 2048,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 5632,\n",
" \"max_position_embeddings\": 2048,\n",
" \"model_type\": \"mixtral\",\n",
" \"num_attention_heads\": 32,\n",
" \"num_experts_per_tok\": 2,\n",
" \"num_hidden_layers\": 22,\n",
" \"num_key_value_heads\": 4,\n",
" \"num_local_experts\": 8,\n",
" \"output_router_logits\": false,\n",
" \"rms_norm_eps\": 1e-05,\n",
" \"rope_theta\": 1000000.0,\n",
" \"router_aux_loss_coef\": 0.02,\n",
" \"sliding_window\": null,\n",
" \"tie_word_embeddings\": false,\n",
" \"torch_dtype\": \"bfloat16\",\n",
" \"transformers_version\": \"4.36.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 32000\n",
"}\n",
"\n",
"MixtralForCausalLM(\n",
" (model): MixtralModel(\n",
" (embed_tokens): Embedding(32000, 2048)\n",
" (layers): ModuleList(\n",
" (0-21): 22 x MixtralDecoderLayer(\n",
" (self_attn): MixtralAttention(\n",
" (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
" (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
" (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
" (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
" (rotary_emb): MixtralRotaryEmbedding()\n",
" )\n",
" (block_sparse_moe): MixtralSparseMoeBlock(\n",
" (gate): Linear(in_features=2048, out_features=8, bias=False)\n",
" (experts): ModuleList(\n",
" (0-7): 8 x MixtralBLockSparseTop2MLP(\n",
" (w1): Linear(in_features=2048, out_features=5632, bias=False)\n",
" (w2): Linear(in_features=5632, out_features=2048, bias=False)\n",
" (w3): Linear(in_features=2048, out_features=5632, bias=False)\n",
" (act_fn): SiLU()\n",
" )\n",
" )\n",
" )\n",
" (input_layernorm): MixtralRMSNorm()\n",
" (post_attention_layernorm): MixtralRMSNorm()\n",
" )\n",
" )\n",
" (norm): MixtralRMSNorm()\n",
" )\n",
" (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n",
")\n",
"check model generate text\n",
" [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] (https://diet-d.com/recipes/mayonnaise-recipe-recipes-for-chicken-or-fish-on-easy-mayonaise-veggie-eggs-dont-use-sweetness.html) are you thinking of substituting these with a recipe that calls for mayonnaise, though?\n",
"cheesecake recipes with a mayonnaise instead of the oil? For how many pounds? May I suggest you substitute the mayonnaise with a cream? (and olive oil instead of soybean oil?\n",
"------------------------\n"
]
}
]
},
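{
"cell_type": "code",
"source": [
"# Optional (disabled): log in to the Hugging Face Hub with the token read via google.colab userdata; uncomment both lines to run.\n",
"# from google.colab import userdata\n",
"# !huggingface-cli login --token {userdata.get('HUGGINGFACE_ACCESS_TOKEN')}"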
],
"metadata": {
"id": "X1jZZ3ggwX9x"
},
"execution_count": null,
"outputs": []
},
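{
"cell_type": "code",
"source": [
"# Optional (disabled): push mx_model and mx_tok to a private Hugging Face Hub repo; uncomment to run.\n",
"# huggingface_repo = \"TinyMixtral-x8-Clonebase-7b\"\n",
"# mx_model.push_to_hub(huggingface_repo, private=True)\n",
"# mx_tok.push_to_hub(huggingface_repo, private=True)"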
],
"metadata": {
"id": "asht1d6Fws_P"
},
"execution_count": null,
"outputs": []
}
]
}