v2ray committed · Commit cdc3f9b (verified) · 1 Parent(s): 668665b

Removed deprecated.

config.json CHANGED
@@ -39,8 +39,10 @@
   "qk_rope_head_dim": 64,
   "quantization_config": {
     "bits": 4,
-    "group_size": 64,
-    "modules_to_not_convert": null,
+    "group_size": 128,
+    "modules_to_not_convert": [
+      "self_attn.kv_a_proj_with_mqa"
+    ],
     "quant_method": "awq",
     "version": "gemm",
     "zero_point": true
@@ -63,7 +65,7 @@
   "topk_group": 4,
   "topk_method": "noaux_tc",
   "torch_dtype": "float16",
-  "transformers_version": "4.47.1",
+  "transformers_version": "4.48.0.dev0",
   "use_cache": false,
   "v_head_dim": 128,
   "vocab_size": 129280
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 0,
   "do_sample": true,
   "eos_token_id": 1,
-  "transformers_version": "4.47.1"
+  "transformers_version": "4.48.0.dev0"
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ad1463dc54628df00279ce2ee0cdc92bb441d921cd8ca4faadaa04c460d5ea9
-size 953522768
+oid sha256:ea1c4769a4c38cd1e604afaa06ee56c9c32c7d9de4a074e646a7083909629c5b
+size 949549016
modeling_deepseek.py CHANGED
@@ -43,7 +43,6 @@ from transformers.modeling_outputs import (
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
-    is_torch_greater_or_equal_than_1_13,
 )
 from transformers.utils import (
     add_start_docstrings,
@@ -66,9 +65,6 @@ if is_flash_attn_2_available():
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
-
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)

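The dropped helper, is_torch_greater_or_equal_than_1_13, appears to have been removed from transformers.pytorch_utils in recent releases, so the torch < 1.13 fallback around torch.fx is gone and torch.fx is used unconditionally. A small guard sketch, assuming the bumped "transformers_version" field above is the intended minimum:

from packaging import version
import transformers

# Hypothetical check: fail early on a transformers build that still expects the old helper.
if version.parse(transformers.__version__) < version.parse("4.48.0.dev0"):
    raise RuntimeError("this modeling_deepseek.py expects transformers >= 4.48.0.dev0")
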
quant.py ADDED
@@ -0,0 +1,20 @@
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+MODEL_PATH = "DeepSeek-V3-1B-Test"
+QUANT_PATH = "DeepSeek-V3-1B-Test-AWQ"
+QUANT_CONFIG = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM", "modules_to_not_convert": ["self_attn.kv_a_proj_with_mqa"]}
+
+def main():
+    model = AutoAWQForCausalLM.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True, use_cache=False)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, legacy=True)
+    model.quantize(
+        tokenizer,
+        quant_config=QUANT_CONFIG,
+    )
+    model.save_quantized(QUANT_PATH)
+    tokenizer.save_pretrained(QUANT_PATH)
+    print(f"Model is quantized and saved at \"{QUANT_PATH}\".")
+
+if __name__ == "__main__":
+    main()
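
quant.py reproduces the settings that now appear in config.json: w_bit=4 with q_group_size=128 on the GEMM kernel, zero points enabled, and self_attn.kv_a_proj_with_mqa left unquantized. A smoke-test sketch for the saved output, assuming a CUDA device and that AutoAWQ can re-load this custom DeepSeek architecture; the prompt and generation settings are illustrative only:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

QUANT_PATH = "DeepSeek-V3-1B-Test-AWQ"  # same directory quant.py writes to

# fuse_layers=False: layer fusion for this custom architecture is not assumed to work.
model = AutoAWQForCausalLM.from_quantized(QUANT_PATH, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(QUANT_PATH)

inputs = tokenizer("Hello", return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))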