{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "ae1fb4b51ee2457998f8066635edcc14": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_347c9d512b454f4781f0dbbfda6c3724", "IPY_MODEL_a458afb6c9b94046bc552a50ada0d867", "IPY_MODEL_f9062f94c8ee41febe6717fcbcb5053f" ], "layout": "IPY_MODEL_a79cec942e034086927cfda8e9d8afca" } }, "347c9d512b454f4781f0dbbfda6c3724": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f30a519a167249b285fa7cf44e5565bc", "placeholder": "​", "style": "IPY_MODEL_dba40b83726b4fdaa5d5c8cb1c4d3c3b", "value": "Loading checkpoint shards: 100%" } }, "a458afb6c9b94046bc552a50ada0d867": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", 
"description": "", "description_tooltip": null, "layout": "IPY_MODEL_65348e69ef7440038f107f5e8efa3b7e", "max": 22, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_ddde5ed5da554643aaea09e0bb0b797a", "value": 22 } }, "f9062f94c8ee41febe6717fcbcb5053f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c138ccb796ff4c1a81ccce900bdec068", "placeholder": "​", "style": "IPY_MODEL_558ca035522c405fba9c7a3f9e8eb135", "value": " 22/22 [00:12<00:00, 1.64it/s]" } }, "a79cec942e034086927cfda8e9d8afca": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"f30a519a167249b285fa7cf44e5565bc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "dba40b83726b4fdaa5d5c8cb1c4d3c3b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "65348e69ef7440038f107f5e8efa3b7e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ddde5ed5da554643aaea09e0bb0b797a": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c138ccb796ff4c1a81ccce900bdec068": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
import json
import torch

# Hyper-parameters that mean the same thing in a Llama and a Mistral config;
# they are copied verbatim from the source checkpoint's config.json.
LLAMA_KEYS_COPIED_TO_MISTRAL = (
    "bos_token_id",
    "eos_token_id",
    "hidden_act",
    "hidden_size",
    "initializer_range",
    "intermediate_size",
    "max_position_embeddings",
    "num_attention_heads",
    "num_hidden_layers",
    "num_key_value_heads",
    "rms_norm_eps",
    "tie_word_embeddings",
    "torch_dtype",
    "use_cache",
    "vocab_size",
)

# RoPE base used when the source config.json has no "rope_theta" entry
# (the default documented for transformers' LlamaConfig).
LLAMA_DEFAULT_ROPE_THETA = 10000.0


def build_mistral_config(llama_config):
    """Translate a Llama ``config.json`` dict into a Mistral config dict.

    Args:
        llama_config: Parsed ``config.json`` of the source Llama checkpoint.
            Every key in ``LLAMA_KEYS_COPIED_TO_MISTRAL`` must be present.

    Returns:
        A new dict suitable for writing as the ``config.json`` of a
        ``MistralForCausalLM`` checkpoint that reuses the Llama weights.

    Raises:
        KeyError: If a required key is missing from ``llama_config``.
    """
    converted = {
        "architectures": ["MistralForCausalLM"],
        "model_type": "mistral",
        "attention_dropout": 0.0,
        # The weights were trained with the source model's rotary base, so it
        # must be carried over; forcing Mistral/Mixtral's 1e6 here would change
        # the positional encoding the attention layers expect.
        "rope_theta": llama_config.get("rope_theta", LLAMA_DEFAULT_ROPE_THETA),
        # The source model attends over the full context: no sliding window.
        "sliding_window": None,
        "transformers_version": "4.36.0",
    }
    for key in LLAMA_KEYS_COPIED_TO_MISTRAL:
        converted[key] = llama_config[key]
    return converted


# load config.json from the cloned model repository
with open(f"{model_name}/config.json") as f:
    config = json.load(f)

print(config)

mistral_config = build_mistral_config(config)

# save tokenizer and model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.save_pretrained(save_mistral_dir)

from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
if model.dtype == torch.float32:
    # Cast straight from float32 to bfloat16. Going through float16 first
    # (model.half()) would squeeze the weights through float16's much narrower
    # exponent range before widening again, losing information for nothing.
    model.to(torch.bfloat16)
    mistral_config["torch_dtype"] = "bfloat16"

# save_pretrained() writes the model's own (Llama) config.json ...
model.save_pretrained(save_mistral_dir)

# ... so the converted Mistral config has to be written afterwards to replace it.
with open(f"{save_mistral_dir}/config.json", "w") as f:
    json.dump(mistral_config, f, indent=2)


#
# Model output test
#
from transformers import AutoModelForCausalLM, AutoTokenizer, MistralForCausalLM, MixtralForCausalLM

def test_gen(model_name_or_path, device="cpu"):
    """Load a checkpoint, print its structure and sample a short chat reply.

    Args:
        model_name_or_path: Directory or hub id holding both the model and its
            tokenizer files.
        device: Torch device string used for generation (default ``"cpu"``).

    Returns:
        ``(model, tokenizer)`` as loaded from ``model_name_or_path``.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    # Use the tokenizer stored next to the checkpoint under test rather than
    # whatever `tokenizer` variable happens to exist in the notebook kernel.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    print("check model load ")
    print(model.config)
    print(model)

    print("check model generate text")
    messages = [
        {"role": "user", "content": "What is your favourite condiment?"},
        {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
        {"role": "user", "content": "Do you have mayonnaise recipes?"}
    ]

    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

    model_inputs = encodeds.to(device)
    model.to(device)

    # Sampling is enabled, so the generated text differs from run to run.
    generated_ids = model.generate(model_inputs, max_new_tokens=128, do_sample=True)
    decoded = tokenizer.batch_decode(generated_ids)
    print(decoded[0])
    print("------------------------")
    return model, tokenizer

_ , _ = test_gen(save_mistral_dir)
MistralAttention(\n", " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n", " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n", " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (rotary_emb): MistralRotaryEmbedding()\n", " )\n", " (mlp): MistralMLP(\n", " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", " (act_fn): SiLU()\n", " )\n", " (input_layernorm): MistralRMSNorm()\n", " (post_attention_layernorm): MistralRMSNorm()\n", " )\n", " )\n", " (norm): MistralRMSNorm()\n", " )\n", " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n", ")\n", "check model generate text\n", " [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] ᴍ [/INST] We are about to test a Mayonnaise recipe. ꧁[INST] It's really good. ᴍ꧁ [INST] Do you know how to make one? [/INST] I've eaten many on my recent days. But, I didn't know any recipe.\n", "[INST] Not here. But, I have tested and I am going to try this recipes sometime. I am so excited!\n", "ᴍ That is very useful for me. 
#
# mixtral config setting
#
import json
import os

# Settings that carry over unchanged from the dense Mistral config.
MISTRAL_KEYS_COPIED_TO_MIXTRAL = (
    "bos_token_id",
    "eos_token_id",
    "hidden_act",
    "hidden_size",
    "initializer_range",
    "intermediate_size",
    "max_position_embeddings",
    "num_attention_heads",
    "num_hidden_layers",
    "num_key_value_heads",
    "rms_norm_eps",
    "rope_theta",
    "sliding_window",
    "tie_word_embeddings",
    "torch_dtype",
    "use_cache",
    "vocab_size",
)


def build_mixtral_config(mistral_config, num_local_experts):
    """Derive a Mixtral config dict from a dense Mistral config dict.

    Args:
        mistral_config: Config dict of the dense model; every key listed in
            ``MISTRAL_KEYS_COPIED_TO_MIXTRAL`` must be present.
        num_local_experts: Number of experts per sparse MoE block.

    Returns:
        A new dict with keys in alphabetical order, ready to be written as the
        ``config.json`` of a ``MixtralForCausalLM`` checkpoint.

    Raises:
        KeyError: If a required key is missing from ``mistral_config``.
    """
    converted = {
        "architectures": ["MixtralForCausalLM"],
        "model_type": "mixtral",
        "attention_dropout": 0.0,
        # MoE-specific settings
        "num_experts_per_tok": 2,
        "num_local_experts": num_local_experts,
        "output_router_logits": False,
        "router_aux_loss_coef": 0.02,
        "transformers_version": "4.36.0.dev0",
    }
    for key in MISTRAL_KEYS_COPIED_TO_MIXTRAL:
        converted[key] = mistral_config[key]
    # Alphabetical key order keeps the printed / saved JSON easy to diff.
    return dict(sorted(converted.items()))


mixtral_config = build_mixtral_config(mistral_config, mixtral_num_experts)

print(json.dumps(mixtral_config,indent=2))

# save the config (create the output directory first; no shell needed)
os.makedirs(save_mixtral_dir, exist_ok=True)
with open(f"{save_mixtral_dir}/config.json", "w") as f:
    json.dump(mixtral_config, f, indent=2)
def convert_weight_name(mistral_key, mixtral_expert_num):
    """Map a Mistral tensor name to its Mixtral counterpart.

    MLP projections are renamed to the given expert's ``w1`` / ``w2`` / ``w3``
    slot inside ``block_sparse_moe``; any other name is returned unchanged.
    """
    # (Mistral projection, Mixtral expert slot), checked in this order.
    projection_slots = (
        ("gate_proj", "w1"),
        ("down_proj", "w2"),
        ("up_proj", "w3"),
    )
    for projection, slot in projection_slots:
        if f"mlp.{projection}" in mistral_key:
            expert_prefix = f".block_sparse_moe.experts.{mixtral_expert_num}.{slot}."
            return mistral_key.replace(f".mlp.{projection}.", expert_prefix)
    return mistral_key


def is_experts_key(mistral_key):
    """Return True when the tensor name belongs to an MLP sub-module."""
    return mistral_key.find(".mlp.") != -1


def get_layer(mistral_key):
    """Return the decoder layer index encoded in a tensor name, or None.

    Only names that start with ``model.layers.<n>.`` carry a layer index.
    """
    found = re.match(r'model[.]layers[.](\d+)[.]', mistral_key)
    if found is None:
        return None
    return int(found.group(1))


def get_weight_byte_size(weight):
    """Return the storage size in bytes of a tensor, or of all parameters of a module."""
    if isinstance(weight, torch.Tensor):
        return weight.nelement() * weight.element_size()

    byte_count = 0
    for param in weight.parameters():
        byte_count += param.nelement() * param.element_size()
    return byte_count
import glob
import json
import os

# Any MLP projection works as a reference here: only its second dimension
# (printed below as gate_shape[1]) and its dtype are needed for the router.
reference_weight = mistral_weights.get_tensor("model.layers.0.mlp.up_proj.weight")
gate_shape = reference_weight.shape
# Constant router weights give every expert the same logit; the experts are
# exact copies of the dense MLP. The gate uses the checkpoint's dtype so the
# saved shards do not mix float32 with the converted weights.
gate_tensor = torch.full((mixtral_num_experts, gate_shape[1]), 0.5, dtype=reference_weight.dtype)

# Never filled by this cell; kept only in case a later cell references it.
common_layer_weights = {}

print("mixtral_num_experts", mixtral_num_experts, "gate_shape[1]", gate_shape[1], "gate_tensor", gate_tensor)

# Single pass over the checkpoint: tensors outside the decoder layers are
# loaded right away, layer tensors are only indexed by their layer number.
first_weights = {}
keys_by_layer = {}
for key in mistral_weights.keys():
    layer_no = get_layer(key)
    if layer_no is None:
        first_weights[key] = mistral_weights.get_tensor(key)
    else:
        keys_by_layer.setdefault(layer_no, []).append(key)

# max layer
max_layer_no = max(keys_by_layer, default=0)

mixtral_weight_map = {
    "metadata": {
        "total_size": 0
    },
    "weight_map": {
    }
}

total_size = 0

# Remove shards left over from a previous run (a no-op on the first run).
for stale_shard in glob.glob(os.path.join(save_mixtral_dir, "*.safetensors")):
    os.remove(stale_shard)

for i in range(max_layer_no + 1):
    weight_file_no = i + 1
    layer_weights = {}

    # The non-layer tensors are stored in the first shard.
    if weight_file_no == 1:
        for key, tensor in first_weights.items():
            mixtral_key = convert_weight_name(key, 0)
            # Look the tensor up by its source key, not by the converted name.
            layer_weights[mixtral_key] = tensor
            total_size += get_weight_byte_size(layer_weights[mixtral_key])
            print("first", mixtral_key, layer_weights[mixtral_key].shape)

    for mistral_layer_key in keys_by_layer.get(i, []):
        if not is_experts_key(mistral_layer_key):
            mixtral_key = convert_weight_name(mistral_layer_key, 0)
            layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key)
            total_size += get_weight_byte_size(layer_weights[mixtral_key])
            print("layer", i , mixtral_key, layer_weights[mixtral_key].shape)
        else:
            print("gen experts")
            # Read the dense projection once and give every expert its own
            # copy, so no two saved tensors share storage.
            dense_projection = mistral_weights.get_tensor(mistral_layer_key)
            for expert_no in range(mixtral_num_experts):
                mixtral_key = convert_weight_name(mistral_layer_key, expert_no)
                layer_weights[mixtral_key] = dense_projection.clone()
                total_size += get_weight_byte_size(layer_weights[mixtral_key])
                print("layer", i , "expert", expert_no, mixtral_key, layer_weights[mixtral_key].shape)

    # gate
    mixtral_key = f"model.layers.{i}.block_sparse_moe.gate.weight"
    layer_weights[mixtral_key] = gate_tensor.clone()
    total_size += get_weight_byte_size(layer_weights[mixtral_key])
    print("layer", i , "gate", mixtral_key, layer_weights[mixtral_key].shape)

    # Zero-pad the shard numbers via the format spec.
    tensor_weight_file_name = f"model.layers.{weight_file_no:05d}-of-{max_layer_no + 1:05d}.safetensors"

    # save safetensor
    save_file(layer_weights, save_mixtral_dir + "/" + tensor_weight_file_name, metadata={"format":"pt"})
    print("Save layer weighs", i, tensor_weight_file_name)

    for key in layer_weights.keys():
        mixtral_weight_map["weight_map"][key] = tensor_weight_file_name

    print(i, tensor_weight_file_name)

# set total size
mixtral_weight_map["metadata"]["total_size"] = total_size

# save model.safetensors.index.json
with open(save_mixtral_dir + "/model.safetensors.index.json", "w") as f:
    json.dump(mixtral_weight_map, f, indent=2)

print(mixtral_weight_map)
..., 0.5000, 0.5000, 0.5000]])\n", "first lm_head.weight torch.Size([32000, 2048])\n", "first model.embed_tokens.weight torch.Size([32000, 2048])\n", "first model.norm.weight torch.Size([2048])\n", "layer 0 model.layers.0.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 
2048])\n", "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 0 model.layers.0.post_attention_layernorm.weight torch.Size([2048])\n", "layer 0 model.layers.0.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 0 model.layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 0 model.layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 0 model.layers.0.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 0 gate model.layers.0.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 0 model.layers.00001-of-00022.safetensors\n", "0 model.layers.00001-of-00022.safetensors\n", "layer 1 model.layers.1.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 1 expert 6 
model.layers.1.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 1 model.layers.1.post_attention_layernorm.weight torch.Size([2048])\n", "layer 1 model.layers.1.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 1 
model.layers.1.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 1 model.layers.1.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 1 model.layers.1.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 1 gate model.layers.1.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 1 model.layers.00002-of-00022.safetensors\n", "1 model.layers.00002-of-00022.safetensors\n", "layer 2 model.layers.2.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 
2048])\n", "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 2 model.layers.2.post_attention_layernorm.weight torch.Size([2048])\n", "layer 2 model.layers.2.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 2 model.layers.2.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 2 model.layers.2.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 2 model.layers.2.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 2 gate model.layers.2.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 2 model.layers.00003-of-00022.safetensors\n", "2 model.layers.00003-of-00022.safetensors\n", "layer 3 model.layers.3.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 3 expert 
4 model.layers.3.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", 
"layer 3 model.layers.3.post_attention_layernorm.weight torch.Size([2048])\n", "layer 3 model.layers.3.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 3 model.layers.3.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 3 model.layers.3.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 3 model.layers.3.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 3 gate model.layers.3.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 3 model.layers.00004-of-00022.safetensors\n", "3 model.layers.00004-of-00022.safetensors\n", "layer 4 model.layers.4.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 4 expert 5 
model.layers.4.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 4 model.layers.4.post_attention_layernorm.weight torch.Size([2048])\n", "layer 4 model.layers.4.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 4 model.layers.4.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 4 model.layers.4.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 4 model.layers.4.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 4 gate model.layers.4.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 4 model.layers.00005-of-00022.safetensors\n", "4 model.layers.00005-of-00022.safetensors\n", "layer 5 model.layers.5.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 5 expert 2 
model.layers.5.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", 
"layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 5 model.layers.5.post_attention_layernorm.weight torch.Size([2048])\n", "layer 5 model.layers.5.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 5 model.layers.5.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 5 model.layers.5.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 5 model.layers.5.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 5 gate model.layers.5.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 5 model.layers.00006-of-00022.safetensors\n", "5 model.layers.00006-of-00022.safetensors\n", "layer 6 model.layers.6.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 6 expert 3 
model.layers.6.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 6 model.layers.6.post_attention_layernorm.weight torch.Size([2048])\n", "layer 6 model.layers.6.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 6 model.layers.6.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 6 model.layers.6.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 6 model.layers.6.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 6 gate model.layers.6.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 6 model.layers.00007-of-00022.safetensors\n", "6 model.layers.00007-of-00022.safetensors\n", "layer 7 model.layers.7.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 7 expert 0 
model.layers.7.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", 
"layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 7 model.layers.7.post_attention_layernorm.weight torch.Size([2048])\n", "layer 7 model.layers.7.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 7 model.layers.7.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 7 model.layers.7.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 7 model.layers.7.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 7 gate model.layers.7.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 7 model.layers.00008-of-00022.safetensors\n", "7 model.layers.00008-of-00022.safetensors\n", "layer 8 model.layers.8.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 8 expert 1 
model.layers.8.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 8 model.layers.8.post_attention_layernorm.weight torch.Size([2048])\n", "layer 8 model.layers.8.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 8 model.layers.8.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 8 model.layers.8.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 8 model.layers.8.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 8 gate model.layers.8.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 8 
model.layers.00009-of-00022.safetensors\n", "8 model.layers.00009-of-00022.safetensors\n", "layer 9 model.layers.9.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", 
"layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 9 model.layers.9.post_attention_layernorm.weight torch.Size([2048])\n", "layer 9 model.layers.9.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 9 model.layers.9.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 9 model.layers.9.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 9 model.layers.9.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 9 gate model.layers.9.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 9 model.layers.00010-of-00022.safetensors\n", "9 model.layers.00010-of-00022.safetensors\n", "layer 10 model.layers.10.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 10 expert 7 
model.layers.10.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 10 model.layers.10.post_attention_layernorm.weight torch.Size([2048])\n", "layer 10 model.layers.10.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 10 model.layers.10.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 10 
model.layers.10.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 10 model.layers.10.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 10 gate model.layers.10.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 10 model.layers.00011-of-00022.safetensors\n", "10 model.layers.00011-of-00022.safetensors\n", "layer 11 model.layers.11.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 11 expert 7 
model.layers.11.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 11 model.layers.11.post_attention_layernorm.weight torch.Size([2048])\n", "layer 11 model.layers.11.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 11 model.layers.11.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 11 model.layers.11.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 11 model.layers.11.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 11 gate model.layers.11.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 11 model.layers.00012-of-00022.safetensors\n", "11 model.layers.00012-of-00022.safetensors\n", "layer 12 model.layers.12.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", 
"layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 12 expert 7 
model.layers.12.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 12 model.layers.12.post_attention_layernorm.weight torch.Size([2048])\n", "layer 12 model.layers.12.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 12 model.layers.12.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 12 model.layers.12.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 12 model.layers.12.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 12 gate model.layers.12.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 12 model.layers.00013-of-00022.safetensors\n", "12 model.layers.00013-of-00022.safetensors\n", "layer 13 model.layers.13.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", 
"layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 13 model.layers.13.post_attention_layernorm.weight torch.Size([2048])\n", "layer 13 model.layers.13.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 13 model.layers.13.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 13 model.layers.13.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 13 model.layers.13.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 13 gate model.layers.13.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 13 model.layers.00014-of-00022.safetensors\n", "13 model.layers.00014-of-00022.safetensors\n", "layer 14 model.layers.14.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w2.weight 
torch.Size([2048, 5632])\n", "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 14 expert 
4 model.layers.14.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 14 model.layers.14.post_attention_layernorm.weight torch.Size([2048])\n", "layer 14 model.layers.14.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 14 model.layers.14.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 14 model.layers.14.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 14 model.layers.14.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 14 gate model.layers.14.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 14 model.layers.00015-of-00022.safetensors\n", "14 model.layers.00015-of-00022.safetensors\n", "layer 15 model.layers.15.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", 
"layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 15 model.layers.15.post_attention_layernorm.weight torch.Size([2048])\n", "layer 15 model.layers.15.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 15 model.layers.15.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 15 model.layers.15.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 15 model.layers.15.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 15 gate model.layers.15.block_sparse_moe.gate.weight 
torch.Size([8, 2048])\n", "Save layer weighs 15 model.layers.00016-of-00022.safetensors\n", "15 model.layers.00016-of-00022.safetensors\n", "layer 16 model.layers.16.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 16 expert 1 
model.layers.16.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 16 model.layers.16.post_attention_layernorm.weight torch.Size([2048])\n", "layer 16 model.layers.16.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 16 model.layers.16.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 16 model.layers.16.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 16 model.layers.16.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 16 gate model.layers.16.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 16 model.layers.00017-of-00022.safetensors\n", "16 model.layers.00017-of-00022.safetensors\n", "layer 17 model.layers.17.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 17 expert 6 
model.layers.17.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 17 model.layers.17.post_attention_layernorm.weight torch.Size([2048])\n", "layer 17 model.layers.17.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 
17 model.layers.17.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 17 model.layers.17.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 17 model.layers.17.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 17 gate model.layers.17.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 17 model.layers.00018-of-00022.safetensors\n", "17 model.layers.00018-of-00022.safetensors\n", "layer 18 model.layers.18.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 18 expert 6 
model.layers.18.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 18 model.layers.18.post_attention_layernorm.weight torch.Size([2048])\n", "layer 18 model.layers.18.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 18 model.layers.18.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 18 model.layers.18.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 18 model.layers.18.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 18 gate model.layers.18.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 18 model.layers.00019-of-00022.safetensors\n", "18 model.layers.00019-of-00022.safetensors\n", "layer 19 model.layers.19.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", 
"layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 19 expert 6 
model.layers.19.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 19 model.layers.19.post_attention_layernorm.weight torch.Size([2048])\n", "layer 19 model.layers.19.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 19 model.layers.19.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 19 model.layers.19.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 19 model.layers.19.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 19 gate model.layers.19.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 19 model.layers.00020-of-00022.safetensors\n", "19 model.layers.00020-of-00022.safetensors\n", "layer 20 model.layers.20.input_layernorm.weight torch.Size([2048])\n", "gen experts\n", "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", 
"layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 20 model.layers.20.post_attention_layernorm.weight torch.Size([2048])\n", "layer 20 model.layers.20.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 20 model.layers.20.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 20 model.layers.20.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 20 model.layers.20.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 20 gate model.layers.20.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 20 model.layers.00021-of-00022.safetensors\n", "20 model.layers.00021-of-00022.safetensors\n", "layer 21 model.layers.21.input_layernorm.weight torch.Size([2048])\n", 
"gen experts\n", "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", "gen experts\n", "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", "gen experts\n", "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", "layer 21 expert 3 
model.layers.21.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", "layer 21 model.layers.21.post_attention_layernorm.weight torch.Size([2048])\n", "layer 21 model.layers.21.self_attn.k_proj.weight torch.Size([256, 2048])\n", "layer 21 model.layers.21.self_attn.o_proj.weight torch.Size([2048, 2048])\n", "layer 21 model.layers.21.self_attn.q_proj.weight torch.Size([2048, 2048])\n", "layer 21 model.layers.21.self_attn.v_proj.weight torch.Size([256, 2048])\n", "layer 21 gate model.layers.21.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", "Save layer weighs 21 model.layers.00022-of-00022.safetensors\n", "21 model.layers.00022-of-00022.safetensors\n", "{'metadata': {'total_size': 12859265024}, 'weight_map': {'lm_head.weight': 'model.layers.00001-of-00022.safetensors', 'model.embed_tokens.weight': 'model.layers.00001-of-00022.safetensors', 'model.norm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.input_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w2.weight': 
'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.post_attention_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.k_proj.weight': 'model.layers.00001-of-00022.safetensors', 
'model.layers.0.self_attn.o_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.q_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.v_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.gate.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.1.input_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w1.weight': 'model.layers.00002-of-00022.safetensors', 
'model.layers.1.block_sparse_moe.experts.0.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.post_attention_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.k_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.o_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.q_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.v_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.gate.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.2.input_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w2.weight': 'model.layers.00003-of-00022.safetensors', 
'model.layers.2.block_sparse_moe.experts.6.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.post_attention_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.k_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.o_proj.weight': 
'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.q_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.v_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.gate.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.3.input_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w1.weight': 'model.layers.00004-of-00022.safetensors', 
'model.layers.3.block_sparse_moe.experts.0.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.post_attention_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.k_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.o_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.q_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.v_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.gate.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.4.input_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w2.weight': 'model.layers.00005-of-00022.safetensors', 
'model.layers.4.block_sparse_moe.experts.6.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.post_attention_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.k_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.o_proj.weight': 
'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.q_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.v_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.gate.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.5.input_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w1.weight': 'model.layers.00006-of-00022.safetensors', 
'model.layers.5.block_sparse_moe.experts.0.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.post_attention_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.k_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.o_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.q_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.v_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.gate.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.6.input_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w2.weight': 'model.layers.00007-of-00022.safetensors', 
'model.layers.6.block_sparse_moe.experts.6.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.post_attention_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.k_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.o_proj.weight': 
'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.q_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.v_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.gate.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.7.input_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w1.weight': 'model.layers.00008-of-00022.safetensors', 
'model.layers.7.block_sparse_moe.experts.0.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.post_attention_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.k_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.o_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.q_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.v_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.gate.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.8.input_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w2.weight': 'model.layers.00009-of-00022.safetensors', 
'model.layers.8.block_sparse_moe.experts.6.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.post_attention_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.k_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.o_proj.weight': 
'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.q_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.v_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.gate.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.9.input_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w1.weight': 'model.layers.00010-of-00022.safetensors', 
'model.layers.9.block_sparse_moe.experts.0.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.post_attention_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.k_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.o_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.q_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.v_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.gate.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.10.input_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w2.weight': 'model.layers.00011-of-00022.safetensors', 
'model.layers.10.block_sparse_moe.experts.6.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.post_attention_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.k_proj.weight': 'model.layers.00011-of-00022.safetensors', 
'model.layers.10.self_attn.o_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.q_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.v_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.gate.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.11.input_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w1.weight': 
'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.post_attention_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.k_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.o_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.q_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.v_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.gate.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.12.input_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w2.weight': 
'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.post_attention_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.k_proj.weight': 
'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.o_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.q_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.v_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.gate.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.13.input_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w1.weight': 'model.layers.00014-of-00022.safetensors', 
'model.layers.13.block_sparse_moe.experts.7.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.post_attention_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.k_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.o_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.q_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.v_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.gate.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.14.input_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w2.weight': 'model.layers.00015-of-00022.safetensors', 
'model.layers.14.block_sparse_moe.experts.5.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.post_attention_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 
'model.layers.14.self_attn.k_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.o_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.q_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.v_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.gate.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.15.input_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w1.weight': 
'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.post_attention_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.k_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.o_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.q_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.v_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.gate.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.16.input_layernorm.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w2.weight': 
'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.post_attention_layernorm.weight': 
'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.k_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.o_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.q_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.v_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.gate.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.17.input_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w1.weight': 'model.layers.00018-of-00022.safetensors', 
'model.layers.17.block_sparse_moe.experts.6.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.post_attention_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.k_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.o_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.q_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.v_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.gate.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.18.input_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w2.weight': 'model.layers.00019-of-00022.safetensors', 
'model.layers.18.block_sparse_moe.experts.4.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w3.weight': 'model.layers.00019-of-00022.safetensors', 
'model.layers.18.post_attention_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.k_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.o_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.q_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.v_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.gate.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.19.input_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w1.weight': 'model.layers.00020-of-00022.safetensors', 
'model.layers.19.block_sparse_moe.experts.6.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.post_attention_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.k_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.o_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.q_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.v_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.gate.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.20.input_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w2.weight': 'model.layers.00021-of-00022.safetensors', 
'model.layers.20.block_sparse_moe.experts.4.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w3.weight': 'model.layers.00021-of-00022.safetensors', 
'model.layers.20.post_attention_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.k_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.o_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.q_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.v_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.gate.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.21.input_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w1.weight': 'model.layers.00022-of-00022.safetensors', 
'model.layers.21.block_sparse_moe.experts.6.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.post_attention_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.k_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.o_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.q_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.v_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.gate.weight': 'model.layers.00022-of-00022.safetensors'}}\n" ] } ] }, { "cell_type": "code", "source": [ "# check model\n", "mx_model, mx_tok = test_gen(save_mixtral_dir)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ "ae1fb4b51ee2457998f8066635edcc14", "347c9d512b454f4781f0dbbfda6c3724", "a458afb6c9b94046bc552a50ada0d867", "f9062f94c8ee41febe6717fcbcb5053f", "a79cec942e034086927cfda8e9d8afca", "f30a519a167249b285fa7cf44e5565bc", "dba40b83726b4fdaa5d5c8cb1c4d3c3b", 
"65348e69ef7440038f107f5e8efa3b7e", "ddde5ed5da554643aaea09e0bb0b797a", "c138ccb796ff4c1a81ccce900bdec068", "558ca035522c405fba9c7a3f9e8eb135" ] }, "id": "DCs_uVxCvCwR", "outputId": "73663da6-0141-48fa-936b-82598865d27a" }, "execution_count": 9, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Loading checkpoint shards: 0%| | 0/22 [00:00 [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] (https://diet-d.com/recipes/mayonnaise-recipe-recipes-for-chicken-or-fish-on-easy-mayonaise-veggie-eggs-dont-use-sweetness.html) are you thinking of substituting these with a recipe that calls for mayonnaise, though?\n", "cheesecake recipes with a mayonnaise instead of the oil? For how many pounds? May I suggest you substitute the mayonnaise with a cream? (and olive oil instead of soybean oil?\n", "------------------------\n" ] } ] }, { "cell_type": "code", "source": [ "# from google.colab import userdata\n", "# !huggingface-cli login --token {userdata.get('HUGGINGFACE_ACCESS_TOKEN')}" ], "metadata": { "id": "X1jZZ3ggwX9x" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# huggingface_repo = \"TinyMixtral-x8-Clonebase-7b\"\n", "# mx_model.push_to_hub(huggingface_repo, private=True)\n", "# mx_tok.push_to_hub(huggingface_repo, private=True)" ], "metadata": { "id": "asht1d6Fws_P" }, "execution_count": null, "outputs": [] } ] }