update tokenization.py
Browse files- README.md +14 -14
- tokenization_qwen.py +44 -15
README.md
CHANGED
@@ -172,20 +172,20 @@ For pre-training data, on the one hand, Qwen-14B uses part of the open-source ge
|
|
172 |
|
173 |
We selected MMLU, C-Eval, GSM8K, MATH, HumanEval, MBPP, BBH, CMMLU, which are currently popular benchmarks, to test the model’s Chinese and English knowledge capabilities, translation, mathematical reasoning, coding and other capabilities. From the following comprehensive evaluation results, we can see that the Qwen model outperforms the similarly sized open-source models on all tasks.
|
174 |
|
175 |
-
| Model | MMLU | C-Eval | GSM8K |
|
176 |
-
|
177 |
-
| |
|
178 |
-
| LLaMA2-7B
|
179 |
-
| LLaMA2-13B
|
180 |
-
| LLaMA2-34B
|
181 |
-
| ChatGLM2-6B
|
182 |
-
| InternLM-7B
|
183 |
-
| InternLM-20B
|
184 |
-
| Baichuan2-7B
|
185 |
-
| Baichuan2-13B
|
186 |
-
| Qwen-7B (original) | 56.7 | 59.6 | 51.6 |
|
187 |
-
| **Qwen-7B** | 58.2 | 63.5 | 51.7 | 11.6 | 29.9 | 31.6
|
188 |
-
| **Qwen-14B** | **66.3** | **72.1** | **61.3** | **24.8** | **32.3** | **40.8**
|
189 |
|
190 |
|
191 |
### 长序列评测(Long-Context Evaluation)
|
|
|
172 |
|
173 |
We selected MMLU, C-Eval, GSM8K, MATH, HumanEval, MBPP, BBH, CMMLU, which are currently popular benchmarks, to test the model’s Chinese and English knowledge capabilities, translation, mathematical reasoning, coding and other capabilities. From the following comprehensive evaluation results, we can see that the Qwen model outperforms the similarly sized open-source models on all tasks.
|
174 |
|
175 |
+
| Model | MMLU | C-Eval | GSM8K | MATH | HumanEval | MBPP | BBH | CMMLU |
|
176 |
+
|:-------------------|:--------:|:--------:|:--------:|:--------:|:---------:|:--------:|:--------:|:--------:|
|
177 |
+
| | 5-shot | 5-shot | 8-shot | 4-shot | 0-shot | 3-shot | 3-shot | 5-shot |
|
178 |
+
| LLaMA2-7B | 46.8 | 32.5 | 16.7 | 3.3 | 12.8 | 20.8 | 38.2 | 31.8 |
|
179 |
+
| LLaMA2-13B | 55.0 | 41.4 | 29.6 | 5.0 | 18.9 | 30.3 | 45.6 | 38.4 |
|
180 |
+
| LLaMA2-34B | 62.6 | - | 42.2 | 6.2 | 22.6 | 33.0 | 44.1 | - |
|
181 |
+
| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 6.5 | - | - | 33.7 | - |
|
182 |
+
| InternLM-7B | 51.0 | 53.4 | 31.2 | 6.3 | 10.4 | 14.0 | 37.0 | 51.8 |
|
183 |
+
| InternLM-20B | 62.1 | 58.8 | 52.6 | 7.9 | 25.6 | 35.6 | 52.5 | 59.0 |
|
184 |
+
| Baichuan2-7B | 54.7 | 56.3 | 24.6 | 5.6 | 18.3 | 24.2 | 41.6 | 57.1 |
|
185 |
+
| Baichuan2-13B | 59.5 | 59.0 | 52.8 | 10.1 | 17.1 | 30.2 | 49.0 | 62.0 |
|
186 |
+
| Qwen-7B (original) | 56.7 | 59.6 | 51.6 | - | 24.4 | 31.2 | 40.6 | 58.8 |
|
187 |
+
| **Qwen-7B** | 58.2 | 63.5 | 51.7 | 11.6 | 29.9 | 31.6 | 45.0 | 62.2 |
|
188 |
+
| **Qwen-14B** | **66.3** | **72.1** | **61.3** | **24.8** | **32.3** | **40.8** | **53.4** | **71.0** |
|
189 |
|
190 |
|
191 |
### 长序列评测(Long-Context Evaluation)
|
tokenization_qwen.py
CHANGED
@@ -27,11 +27,21 @@ IMEND = "<|im_end|>"
|
|
27 |
# regular texts, the surface forms of special tokens need to be
|
28 |
# as different as possible to minimize the impact
|
29 |
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
|
37 |
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
@@ -42,6 +52,7 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
|
42 |
for token, rank in (line.split() for line in contents.splitlines() if line)
|
43 |
}
|
44 |
|
|
|
45 |
class QWenTokenizer(PreTrainedTokenizer):
|
46 |
"""QWen tokenizer."""
|
47 |
|
@@ -51,20 +62,35 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
51 |
self,
|
52 |
vocab_file,
|
53 |
errors="replace",
|
|
|
54 |
**kwargs,
|
55 |
):
|
56 |
super().__init__(**kwargs)
|
57 |
|
58 |
-
|
|
|
|
|
59 |
|
60 |
-
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type:
|
61 |
self.special_tokens = {
|
62 |
token: index
|
63 |
-
for index, token in
|
64 |
-
SPECIAL_TOKENS, start=len(self.mergeable_ranks)
|
65 |
-
)
|
66 |
}
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
enc = tiktoken.Encoding(
|
69 |
"Qwen",
|
70 |
pat_str=PAT_STR,
|
@@ -89,7 +115,7 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
89 |
def __getstate__(self):
|
90 |
# for pickle lovers
|
91 |
state = self.__dict__.copy()
|
92 |
-
del state[
|
93 |
return state
|
94 |
|
95 |
def __setstate__(self, state):
|
@@ -103,7 +129,6 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
103 |
)
|
104 |
self.tokenizer = enc
|
105 |
|
106 |
-
|
107 |
def __len__(self) -> int:
|
108 |
return self.tokenizer.n_vocab
|
109 |
|
@@ -126,13 +151,17 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
126 |
ids.append(self.mergeable_ranks.get(token))
|
127 |
return ids
|
128 |
|
129 |
-
def _add_tokens(
|
|
|
|
|
|
|
|
|
130 |
if not special_tokens and new_tokens:
|
131 |
-
raise ValueError(
|
132 |
for token in new_tokens:
|
133 |
surface_form = token.content if isinstance(token, AddedToken) else token
|
134 |
if surface_form not in SPECIAL_TOKENS:
|
135 |
-
raise ValueError(
|
136 |
return 0
|
137 |
|
138 |
def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
|
|
|
27 |
# regular texts, the surface forms of special tokens need to be
|
28 |
# as different as possible to minimize the impact
|
29 |
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
30 |
+
# changed to use actual index to avoid misconfiguration with vocabulary expansion
|
31 |
+
SPECIAL_START_ID = 151643
|
32 |
+
SPECIAL_TOKENS = tuple(
|
33 |
+
enumerate(
|
34 |
+
(
|
35 |
+
(
|
36 |
+
ENDOFTEXT,
|
37 |
+
IMSTART,
|
38 |
+
IMEND,
|
39 |
+
)
|
40 |
+
+ EXTRAS
|
41 |
+
),
|
42 |
+
start=SPECIAL_START_ID,
|
43 |
+
)
|
44 |
+
)
|
45 |
|
46 |
|
47 |
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
|
|
52 |
for token, rank in (line.split() for line in contents.splitlines() if line)
|
53 |
}
|
54 |
|
55 |
+
|
56 |
class QWenTokenizer(PreTrainedTokenizer):
|
57 |
"""QWen tokenizer."""
|
58 |
|
|
|
62 |
self,
|
63 |
vocab_file,
|
64 |
errors="replace",
|
65 |
+
extra_vocab_file=None,
|
66 |
**kwargs,
|
67 |
):
|
68 |
super().__init__(**kwargs)
|
69 |
|
70 |
+
# how to handle errors in decoding UTF-8 byte sequences
|
71 |
+
# use ignore if you are in streaming inference
|
72 |
+
self.errors = errors
|
73 |
|
74 |
+
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
|
75 |
self.special_tokens = {
|
76 |
token: index
|
77 |
+
for index, token in SPECIAL_TOKENS
|
|
|
|
|
78 |
}
|
79 |
|
80 |
+
# try load extra vocab from file
|
81 |
+
if extra_vocab_file is not None:
|
82 |
+
used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
|
83 |
+
extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
|
84 |
+
for token, index in extra_mergeable_ranks.items():
|
85 |
+
if token in self.mergeable_ranks:
|
86 |
+
logger.info(f"extra token {token} exists, skipping")
|
87 |
+
continue
|
88 |
+
if index in used_ids:
|
89 |
+
logger.info(f'the index {index} for extra token {token} exists, skipping')
|
90 |
+
continue
|
91 |
+
self.mergeable_ranks[token] = index
|
92 |
+
# the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
|
93 |
+
|
94 |
enc = tiktoken.Encoding(
|
95 |
"Qwen",
|
96 |
pat_str=PAT_STR,
|
|
|
115 |
def __getstate__(self):
|
116 |
# for pickle lovers
|
117 |
state = self.__dict__.copy()
|
118 |
+
del state["tokenizer"]
|
119 |
return state
|
120 |
|
121 |
def __setstate__(self, state):
|
|
|
129 |
)
|
130 |
self.tokenizer = enc
|
131 |
|
|
|
132 |
def __len__(self) -> int:
|
133 |
return self.tokenizer.n_vocab
|
134 |
|
|
|
151 |
ids.append(self.mergeable_ranks.get(token))
|
152 |
return ids
|
153 |
|
154 |
+
def _add_tokens(
|
155 |
+
self,
|
156 |
+
new_tokens: Union[List[str], List[AddedToken]],
|
157 |
+
special_tokens: bool = False,
|
158 |
+
) -> int:
|
159 |
if not special_tokens and new_tokens:
|
160 |
+
raise ValueError("Adding regular tokens is not supported")
|
161 |
for token in new_tokens:
|
162 |
surface_form = token.content if isinstance(token, AddedToken) else token
|
163 |
if surface_form not in SPECIAL_TOKENS:
|
164 |
+
raise ValueError("Adding unknown special tokens is not supported")
|
165 |
return 0
|
166 |
|
167 |
def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
|