Removed deprecated.
- config.json +5 -3
- generation_config.json +1 -1
- model.safetensors +2 -2
- modeling_deepseek.py +0 -4
- quant.py +20 -0
config.json
CHANGED
@@ -39,8 +39,10 @@
   "qk_rope_head_dim": 64,
   "quantization_config": {
     "bits": 4,
-    "group_size":
-    "modules_to_not_convert":
+    "group_size": 128,
+    "modules_to_not_convert": [
+      "self_attn.kv_a_proj_with_mqa"
+    ],
     "quant_method": "awq",
     "version": "gemm",
     "zero_point": true
@@ -63,7 +65,7 @@
   "topk_group": 4,
   "topk_method": "noaux_tc",
   "torch_dtype": "float16",
-  "transformers_version": "4.
+  "transformers_version": "4.48.0.dev0",
   "use_cache": false,
   "v_head_dim": 128,
   "vocab_size": 129280
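With "quant_method": "awq" recorded in config.json, transformers can dispatch to AutoAWQ at load time. A minimal loading sketch, not part of this commit, assuming autoawq is installed and using the QUANT_PATH that quant.py (below) writes; trust_remote_code is needed because the repo ships its own modeling_deepseek.py:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Path is an assumption (the QUANT_PATH used by quant.py in this commit).
model = AutoModelForCausalLM.from_pretrained(
    "DeepSeek-V3-1B-Test-AWQ",
    torch_dtype="float16",   # matches "torch_dtype" in config.json above
    device_map="auto",
    trust_remote_code=True,  # custom DeepSeek-V3 modeling code
)
tokenizer = AutoTokenizer.from_pretrained("DeepSeek-V3-1B-Test-AWQ", trust_remote_code=True)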
generation_config.json
CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 0,
   "do_sample": true,
   "eos_token_id": 1,
-  "transformers_version": "4.
+  "transformers_version": "4.48.0.dev0"
 }
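These are the defaults transformers picks up at generation time; a quick way to inspect them (repo path assumed, as above):

from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("DeepSeek-V3-1B-Test-AWQ")  # assumed path
print(gen.do_sample, gen.bos_token_id, gen.eos_token_id)  # True 0 1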
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ea1c4769a4c38cd1e604afaa06ee56c9c32c7d9de4a074e646a7083909629c5b
+size 949549016
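The weights live in Git LFS, so the diff only touches the pointer file: the new sha256 and byte size (~0.95 GB) identify the re-quantized checkpoint. A small sketch to verify a downloaded copy against the pointer (the local path is an assumption):

import hashlib, os

path = "model.safetensors"  # assumed local download location
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest() == "ea1c4769a4c38cd1e604afaa06ee56c9c32c7d9de4a074e646a7083909629c5b")
print(os.path.getsize(path) == 949549016)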
modeling_deepseek.py
CHANGED
@@ -43,7 +43,6 @@ from transformers.modeling_outputs import (
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
-    is_torch_greater_or_equal_than_1_13,
 )
 from transformers.utils import (
     add_start_docstrings,
@@ -66,9 +65,6 @@ if is_flash_attn_2_available():
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
-
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
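This is the "deprecated" removal from the commit title: presumably the is_torch_greater_or_equal_than_1_13 shim is no longer exported by the transformers version the config now records (4.48.0.dev0), and on torch >= 1.13 torch.fx is already reachable through the top-level torch package, so the guarded fallback import is dead code there. A quick standalone check of that assumption:

import torch  # note: no explicit `import torch.fx`

def leaf(a, b):
    return a + b

# On torch >= 1.13 the fx submodule is loaded by `import torch` itself,
# so wrap() works without the fallback import this commit removes.
wrapped = torch.fx.wrap(leaf)
print(torch.__version__, wrapped is leaf)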
quant.py
ADDED
@@ -0,0 +1,20 @@
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+MODEL_PATH = "DeepSeek-V3-1B-Test"
+QUANT_PATH = "DeepSeek-V3-1B-Test-AWQ"
+QUANT_CONFIG = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM", "modules_to_not_convert": ["self_attn.kv_a_proj_with_mqa"]}
+
+def main():
+    model = AutoAWQForCausalLM.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True, use_cache=False)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, legacy=True)
+    model.quantize(
+        tokenizer,
+        quant_config=QUANT_CONFIG,
+    )
+    model.save_quantized(QUANT_PATH)
+    tokenizer.save_pretrained(QUANT_PATH)
+    print(f"Model is quantized and saved at \"{QUANT_PATH}\".")
+
+if __name__ == "__main__":
+    main()
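Note how QUANT_CONFIG maps onto the serialized quantization_config in config.json above: w_bit -> bits, q_group_size -> group_size, "GEMM" -> "gemm", and modules_to_not_convert carried over verbatim (the self_attn.kv_a_proj_with_mqa projection is left unquantized). Once the script has run, the output folder can be loaded back for inference; a minimal sketch, assuming a GPU runtime and an AutoAWQ build that handles this architecture:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "DeepSeek-V3-1B-Test-AWQ"
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)  # fusion support is an assumption, so keep it off
tokenizer = AutoTokenizer.from_pretrained(quant_path)

tokens = tokenizer("Hello, my name is", return_tensors="pt").input_ids.cuda()
out = model.generate(tokens, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))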