sigma-moe-small / config.json
{
  "_name_or_path": "/dccstor/broccoli/huggingface/transformers/sigma_moe/wikitext/moe/fp",
  "activation": "gelu_new",
  "activation_after_topk": false,
  "architectures": [
    "SigmaMoEForCausalLM"
  ],
  "attention_dropout": 0.1,
  "d_ff": 2053,
  "d_model": 410,
  "embd_pdrop": 0.1,
  "expert_dropout": 0.0,
  "expert_size": 128,
  "initializer_range": 0.025,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 512,
  "moe_bias": false,
  "moe_dropout": 0.0,
  "n_experts": 16,
  "num_attention_heads": 10,
  "num_hidden_layers": 16,
  "num_sparse_hidden_layers": 16,
  "partial_rotary_factor": 0.5,
  "qk_layernorm": false,
  "resid_pdrop": 0.1,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "routing_regularization": 0.001,
  "selection_mode": "sigmoid",
  "sinkhorn_n_iters": 3,
  "sparse_step": 1,
  "tie_word_embeddings": false,
  "top_k_experts": 4,
  "torch_dtype": "float32",
  "transformers_version": "4.37.0.dev0",
  "use_cache": false,
  "v_dim": null,
  "vocab_size": 8000,
  "weight_std_scale": 1.0
}
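
A minimal sketch of loading this configuration and model with the transformers library. The repo id "jubueche/sigma-moe-small" is an assumption based on the page context, and trust_remote_code=True is assumed to be needed because SigmaMoEForCausalLM is a custom architecture rather than part of core transformers.

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Assumed repo id; adjust to the actual Hub path if different.
repo_id = "jubueche/sigma-moe-small"

# Read the config shown above; remote code is assumed to be required
# because the SigmaMoE architecture ships with the repository.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.n_experts, config.top_k_experts)  # 16 experts, top-4 routing

# Instantiate the causal LM with the same config and weights.
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```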