v2ray committed · Commit cdc3f9b (verified) · 1 Parent(s): 668665b

Removed deprecated.

config.json CHANGED
@@ -39,8 +39,10 @@
   "qk_rope_head_dim": 64,
   "quantization_config": {
     "bits": 4,
-    "group_size": 64,
-    "modules_to_not_convert": null,
+    "group_size": 128,
+    "modules_to_not_convert": [
+      "self_attn.kv_a_proj_with_mqa"
+    ],
     "quant_method": "awq",
     "version": "gemm",
     "zero_point": true
@@ -63,7 +65,7 @@
   "topk_group": 4,
   "topk_method": "noaux_tc",
   "torch_dtype": "float16",
-  "transformers_version": "4.47.1",
+  "transformers_version": "4.48.0.dev0",
   "use_cache": false,
   "v_head_dim": 128,
   "vocab_size": 129280
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 0,
   "do_sample": true,
   "eos_token_id": 1,
-  "transformers_version": "4.47.1"
+  "transformers_version": "4.48.0.dev0"
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ad1463dc54628df00279ce2ee0cdc92bb441d921cd8ca4faadaa04c460d5ea9
-size 953522768
+oid sha256:ea1c4769a4c38cd1e604afaa06ee56c9c32c7d9de4a074e646a7083909629c5b
+size 949549016
modeling_deepseek.py CHANGED
@@ -43,7 +43,6 @@ from transformers.modeling_outputs import (
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
-    is_torch_greater_or_equal_than_1_13,
 )
 from transformers.utils import (
     add_start_docstrings,
@@ -66,9 +65,6 @@ if is_flash_attn_2_available():
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
-
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)

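The dropped helper, is_torch_greater_or_equal_than_1_13, appears to have been removed from transformers.pytorch_utils in recent releases, so the torch < 1.13 fallback around torch.fx is gone and torch.fx is used unconditionally. A small guard sketch, assuming the bumped "transformers_version" field above is the intended minimum:

from packaging import version
import transformers

# Hypothetical check: fail early on a transformers build that still expects the old helper.
if version.parse(transformers.__version__) < version.parse("4.48.0.dev0"):
    raise RuntimeError("this modeling_deepseek.py expects transformers >= 4.48.0.dev0")
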
quant.py ADDED
@@ -0,0 +1,20 @@
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+MODEL_PATH = "DeepSeek-V3-1B-Test"
+QUANT_PATH = "DeepSeek-V3-1B-Test-AWQ"
+QUANT_CONFIG = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM", "modules_to_not_convert": ["self_attn.kv_a_proj_with_mqa"]}
+
+def main():
+    model = AutoAWQForCausalLM.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True, use_cache=False)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, legacy=True)
+    model.quantize(
+        tokenizer,
+        quant_config=QUANT_CONFIG,
+    )
+    model.save_quantized(QUANT_PATH)
+    tokenizer.save_pretrained(QUANT_PATH)
+    print(f"Model is quantized and saved at \"{QUANT_PATH}\".")
+
+if __name__ == "__main__":
+    main()
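
quant.py reproduces the settings that now appear in config.json: w_bit=4 with q_group_size=128 on the GEMM kernel, zero points enabled, and self_attn.kv_a_proj_with_mqa left unquantized. A smoke-test sketch for the saved output, assuming a CUDA device and that AutoAWQ can re-load this custom DeepSeek architecture; the prompt and generation settings are illustrative only:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

QUANT_PATH = "DeepSeek-V3-1B-Test-AWQ"  # same directory quant.py writes to

# fuse_layers=False: layer fusion for this custom architecture is not assumed to work.
model = AutoAWQForCausalLM.from_quantized(QUANT_PATH, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(QUANT_PATH)

inputs = tokenizer("Hello", return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))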