Commit f254c87 by yangapku (parent: 680a3da)

update tokenization.py

Files changed (2):
  1. README.md (+14 −14)
  2. tokenization_qwen.py (+44 −15)
README.md CHANGED

```diff
@@ -172,20 +172,20 @@ For pre-training data, on the one hand, Qwen-14B uses part of the open-source ge
 
 We selected MMLU, C-Eval, GSM8K, MATH, HumanEval, MBPP, BBH, and CMMLU, which are currently popular benchmarks, to test the model's Chinese and English knowledge, translation, mathematical reasoning, coding, and other capabilities. From the following comprehensive evaluation results, we can see that the Qwen models outperform the similarly sized open-source models on all tasks.
 
-| Model | MMLU | C-Eval | GSM8K | MATH | HumanEval | MBPP | BBH | CMMLU |
-|:-------------------|:--------:|:--------:|:--------:|:--------:|:---------:|:---------:|:--------:|:--------:|
-| | 5-shot | 5-shot | 8-shot | 4-shot | 0-shot | 3-shot | 3-shot | 5-shot |
-| LLaMA2-7B | 46.8 | 32.5 | 16.7 | 3.3 | 12.8 | 20.8 | 38.2 | 31.8 |
-| LLaMA2-13B | 55.0 | 41.4 | 29.6 | 5.0 | 18.9 | 30.3 | 45.6 | 38.4 |
-| LLaMA2-34B | 62.6 | - | 42.2 | 6.2 | 22.6 | 33.0 | 44.1 | - |
-| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 6.5 | - | - | 33.7 | - |
-| InternLM-7B | 51.0 | 53.4 | 31.2 | 6.3 | 10.4 | 14.0 | 37.0 | 51.8 |
-| InternLM-20B | 62.1 | 58.8 | 52.6 | 7.9 | 25.6 | 35.6 | 52.5 | 59.0 |
-| Baichuan2-7B | 54.7 | 56.3 | 24.6 | 5.6 | 18.3 | 24.2 | 41.6 | 57.1 |
-| Baichuan2-13B | 59.5 | 59.0 | 52.8 | 10.1 | 17.1 | 30.2 | 49.0 | 62.0 |
-| Qwen-7B (original) | 56.7 | 59.6 | 51.6 | - | 24.4 | 31.2 | 40.6 | 58.8 |
-| **Qwen-7B** | 58.2 | 63.5 | 51.7 | 11.6 | 29.9 | 31.6 | 45.0 | 62.2 |
-| **Qwen-14B** | **66.3** | **72.1** | **61.3** | **24.8** | **32.3** | **40.8** | **53.4** | **71.0** |
+| Model | MMLU | C-Eval | GSM8K | MATH | HumanEval | MBPP | BBH | CMMLU |
+|:-------------------|:--------:|:--------:|:--------:|:--------:|:---------:|:--------:|:--------:|:--------:|
+| | 5-shot | 5-shot | 8-shot | 4-shot | 0-shot | 3-shot | 3-shot | 5-shot |
+| LLaMA2-7B | 46.8 | 32.5 | 16.7 | 3.3 | 12.8 | 20.8 | 38.2 | 31.8 |
+| LLaMA2-13B | 55.0 | 41.4 | 29.6 | 5.0 | 18.9 | 30.3 | 45.6 | 38.4 |
+| LLaMA2-34B | 62.6 | - | 42.2 | 6.2 | 22.6 | 33.0 | 44.1 | - |
+| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 6.5 | - | - | 33.7 | - |
+| InternLM-7B | 51.0 | 53.4 | 31.2 | 6.3 | 10.4 | 14.0 | 37.0 | 51.8 |
+| InternLM-20B | 62.1 | 58.8 | 52.6 | 7.9 | 25.6 | 35.6 | 52.5 | 59.0 |
+| Baichuan2-7B | 54.7 | 56.3 | 24.6 | 5.6 | 18.3 | 24.2 | 41.6 | 57.1 |
+| Baichuan2-13B | 59.5 | 59.0 | 52.8 | 10.1 | 17.1 | 30.2 | 49.0 | 62.0 |
+| Qwen-7B (original) | 56.7 | 59.6 | 51.6 | - | 24.4 | 31.2 | 40.6 | 58.8 |
+| **Qwen-7B** | 58.2 | 63.5 | 51.7 | 11.6 | 29.9 | 31.6 | 45.0 | 62.2 |
+| **Qwen-14B** | **66.3** | **72.1** | **61.3** | **24.8** | **32.3** | **40.8** | **53.4** | **71.0** |
 
 
 ### Long-Context Evaluation
```
tokenization_qwen.py CHANGED

```diff
@@ -27,11 +27,21 @@ IMEND = "<|im_end|>"
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
-SPECIAL_TOKENS = (
-    ENDOFTEXT,
-    IMSTART,
-    IMEND,
-) + EXTRAS
+# changed to use actual index to avoid misconfiguration with vocabulary expansion
+SPECIAL_START_ID = 151643
+SPECIAL_TOKENS = tuple(
+    enumerate(
+        (
+            (
+                ENDOFTEXT,
+                IMSTART,
+                IMEND,
+            )
+            + EXTRAS
+        ),
+        start=SPECIAL_START_ID,
+    )
+)
 
 
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
@@ -42,6 +52,7 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }
 
+
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
 
@@ -51,20 +62,35 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         vocab_file,
         errors="replace",
+        extra_vocab_file=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
 
-        self.errors = errors  # how to handle errors in decoding
+        # how to handle errors in decoding UTF-8 byte sequences
+        # use ignore if you are in streaming inference
+        self.errors = errors
 
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
         self.special_tokens = {
             token: index
-            for index, token in enumerate(
-                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
-            )
+            for index, token in SPECIAL_TOKENS
         }
 
+        # try load extra vocab from file
+        if extra_vocab_file is not None:
+            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
+            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
+            for token, index in extra_mergeable_ranks.items():
+                if token in self.mergeable_ranks:
+                    logger.info(f"extra token {token} exists, skipping")
+                    continue
+                if index in used_ids:
+                    logger.info(f'the index {index} for extra token {token} exists, skipping')
+                    continue
+                self.mergeable_ranks[token] = index
+            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
+
         enc = tiktoken.Encoding(
             "Qwen",
             pat_str=PAT_STR,
@@ -89,7 +115,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def __getstate__(self):
         # for pickle lovers
         state = self.__dict__.copy()
-        del state['tokenizer']
+        del state["tokenizer"]
         return state
 
     def __setstate__(self, state):
@@ -103,7 +129,6 @@ class QWenTokenizer(PreTrainedTokenizer):
         )
         self.tokenizer = enc
 
-
     def __len__(self) -> int:
         return self.tokenizer.n_vocab
 
@@ -126,13 +151,17 @@ class QWenTokenizer(PreTrainedTokenizer):
             ids.append(self.mergeable_ranks.get(token))
         return ids
 
-    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError('Adding regular tokens is not supported')
+            raise ValueError("Adding regular tokens is not supported")
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
             if surface_form not in SPECIAL_TOKENS:
-                raise ValueError('Adding unknown special tokens is not supported')
+                raise ValueError("Adding unknown special tokens is not supported")
         return 0
 
     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
```
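A note on the `SPECIAL_TOKENS` change: the special-token ids were previously derived from `len(self.mergeable_ranks)` at construction time, so expanding the BPE vocabulary (as the new `extra_vocab_file` path allows) would have silently shifted every special-token id. Pinning `SPECIAL_START_ID = 151643` keeps them stable. A minimal sketch of the resulting layout, using only the constants from the diff:

```python
# Minimal sketch of the id layout produced by the new SPECIAL_TOKENS.
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
EXTRAS = tuple(f"<|extra_{i}|>" for i in range(205))

SPECIAL_START_ID = 151643
SPECIAL_TOKENS = tuple(
    enumerate((ENDOFTEXT, IMSTART, IMEND) + EXTRAS, start=SPECIAL_START_ID)
)

assert SPECIAL_TOKENS[0] == (151643, "<|endoftext|>")   # fixed, regardless of BPE size
assert SPECIAL_TOKENS[2] == (151645, "<|im_end|>")
assert SPECIAL_TOKENS[-1] == (151850, "<|extra_204|>")  # 151643 + 207
```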
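`_load_tiktoken_bpe` is only partially visible in the hunk context, but the parsing loop implies the standard tiktoken vocabulary format: one base64-encoded token and its integer rank per line. A sketch of that round trip under this assumption (the toy tokens are made up):

```python
# Build and parse a toy vocab in the assumed tiktoken BPE file format:
# "<base64(token bytes)> <rank>" per line.
import base64
from typing import Dict

contents = b"\n".join(
    base64.b64encode(tok) + b" " + str(rank).encode()
    for rank, tok in enumerate([b"hello", b" world", b"!"])
)

ranks: Dict[bytes, int] = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in contents.splitlines() if line)
}
assert ranks == {b"hello": 0, b" world": 1, b"!": 2}
```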
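The new comment on `self.errors` is worth unpacking: during streaming inference a multi-byte UTF-8 character can straddle two decode calls, so `errors="replace"` emits a U+FFFD placeholder for the dangling bytes while `errors="ignore"` drops them silently. A quick illustration in plain Python:

```python
# A multi-byte UTF-8 character split mid-stream leaves an incomplete sequence.
data = "你好".encode("utf-8")  # 6 bytes, 3 per character
chunk = data[:4]               # "你" plus only 1 of the 3 bytes of "好"

print(chunk.decode("utf-8", errors="replace"))  # 你� (dangling bytes -> U+FFFD)
print(chunk.decode("utf-8", errors="ignore"))   # 你  (dangling bytes dropped)
# errors="strict" would raise UnicodeDecodeError instead.
```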
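Usage-wise, `extra_vocab_file` merges an additional vocabulary at load time; entries whose token or id collides with the base BPE ranks or the special-token range are skipped with a log message rather than raising. A hedged sketch, assuming the usual `trust_remote_code` kwarg forwarding into `__init__` (the extra file name here is hypothetical):

```python
from transformers import AutoTokenizer

# extra.tiktoken is hypothetical: same "<base64 token> <id>" format as the base
# vocab file, with ids chosen outside both the BPE ranks and the 151643+ range.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen-14B",
    trust_remote_code=True,             # picks up this tokenization_qwen.py
    extra_vocab_file="extra.tiktoken",
)
```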
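The `__getstate__`/`__setstate__` pair ("for pickle lovers") exists because `tiktoken.Encoding` wraps a native object that pickle cannot serialize; it is dropped on dump and rebuilt from `mergeable_ranks` and `special_tokens` on load. So a round trip like this should work:

```python
import pickle
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen-14B", trust_remote_code=True)
clone = pickle.loads(pickle.dumps(tok))  # tokenizer is rebuilt in __setstate__
assert clone.tokenizer.n_vocab == tok.tokenizer.n_vocab
```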
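Finally, `_add_tokens` (now reformatted across multiple lines) still enforces a fixed vocabulary: regular tokens are rejected outright, and only special tokens already known to the tokenizer are tolerated, with a return value of 0 since nothing is actually added. For example, via the standard `transformers` API:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen-14B", trust_remote_code=True)
try:
    tok.add_tokens(["<my-new-token>"])  # routed to _add_tokens(special_tokens=False)
except ValueError as err:
    print(err)  # Adding regular tokens is not supported
```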
166
 
167
  def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]: