Commit f254c87 by yangapku (parent: 680a3da)

update tokenization.py

Files changed (2):
  1. README.md (+14 −14)
  2. tokenization_qwen.py (+44 −15)
README.md CHANGED

```diff
@@ -172,20 +172,20 @@ For pre-training data, on the one hand, Qwen-14B uses part of the open-source ge
 
 We selected MMLU, C-Eval, GSM8K, MATH, HumanEval, MBPP, BBH, and CMMLU, which are currently popular benchmarks, to test the model's Chinese and English knowledge, translation, mathematical reasoning, coding, and other capabilities. From the following comprehensive evaluation results, we can see that the Qwen models outperform the similarly sized open-source models on all tasks.
 
-| Model | MMLU | C-Eval | GSM8K | MATH | HumanEval | MBPP | BBH | CMMLU |
-|:-------------------|:--------:|:--------:|:--------:|:--------:|:---------:|:---------:|:--------:|:--------:|
-| | 5-shot | 5-shot | 8-shot | 4-shot | 0-shot | 3-shot | 3-shot | 5-shot |
-| LLaMA2-7B | 46.8 | 32.5 | 16.7 | 3.3 | 12.8 | 20.8 | 38.2 | 31.8 |
-| LLaMA2-13B | 55.0 | 41.4 | 29.6 | 5.0 | 18.9 | 30.3 | 45.6 | 38.4 |
-| LLaMA2-34B | 62.6 | - | 42.2 | 6.2 | 22.6 | 33.0 | 44.1 | - |
-| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 6.5 | - | - | 33.7 | - |
-| InternLM-7B | 51.0 | 53.4 | 31.2 | 6.3 | 10.4 | 14.0 | 37.0 | 51.8 |
-| InternLM-20B | 62.1 | 58.8 | 52.6 | 7.9 | 25.6 | 35.6 | 52.5 | 59.0 |
-| Baichuan2-7B | 54.7 | 56.3 | 24.6 | 5.6 | 18.3 | 24.2 | 41.6 | 57.1 |
-| Baichuan2-13B | 59.5 | 59.0 | 52.8 | 10.1 | 17.1 | 30.2 | 49.0 | 62.0 |
-| Qwen-7B (original) | 56.7 | 59.6 | 51.6 | - | 24.4 | 31.2 | 40.6 | 58.8 |
-| **Qwen-7B** | 58.2 | 63.5 | 51.7 | 11.6 | 29.9 | 31.6 | 45.0 | 62.2 |
-| **Qwen-14B** | **66.3** | **72.1** | **61.3** | **24.8** | **32.3** | **40.8** | **53.4** | **71.0** |
+| Model | MMLU | C-Eval | GSM8K | MATH | HumanEval | MBPP | BBH | CMMLU |
+|:-------------------|:--------:|:--------:|:--------:|:--------:|:---------:|:--------:|:--------:|:--------:|
+| | 5-shot | 5-shot | 8-shot | 4-shot | 0-shot | 3-shot | 3-shot | 5-shot |
+| LLaMA2-7B | 46.8 | 32.5 | 16.7 | 3.3 | 12.8 | 20.8 | 38.2 | 31.8 |
+| LLaMA2-13B | 55.0 | 41.4 | 29.6 | 5.0 | 18.9 | 30.3 | 45.6 | 38.4 |
+| LLaMA2-34B | 62.6 | - | 42.2 | 6.2 | 22.6 | 33.0 | 44.1 | - |
+| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 6.5 | - | - | 33.7 | - |
+| InternLM-7B | 51.0 | 53.4 | 31.2 | 6.3 | 10.4 | 14.0 | 37.0 | 51.8 |
+| InternLM-20B | 62.1 | 58.8 | 52.6 | 7.9 | 25.6 | 35.6 | 52.5 | 59.0 |
+| Baichuan2-7B | 54.7 | 56.3 | 24.6 | 5.6 | 18.3 | 24.2 | 41.6 | 57.1 |
+| Baichuan2-13B | 59.5 | 59.0 | 52.8 | 10.1 | 17.1 | 30.2 | 49.0 | 62.0 |
+| Qwen-7B (original) | 56.7 | 59.6 | 51.6 | - | 24.4 | 31.2 | 40.6 | 58.8 |
+| **Qwen-7B** | 58.2 | 63.5 | 51.7 | 11.6 | 29.9 | 31.6 | 45.0 | 62.2 |
+| **Qwen-14B** | **66.3** | **72.1** | **61.3** | **24.8** | **32.3** | **40.8** | **53.4** | **71.0** |
 
 
 ### Long-Context Evaluation
```
tokenization_qwen.py CHANGED

```diff
@@ -27,11 +27,21 @@ IMEND = "<|im_end|>"
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
-SPECIAL_TOKENS = (
-    ENDOFTEXT,
-    IMSTART,
-    IMEND,
-) + EXTRAS
+# changed to use actual index to avoid misconfiguration with vocabulary expansion
+SPECIAL_START_ID = 151643
+SPECIAL_TOKENS = tuple(
+    enumerate(
+        (
+            (
+                ENDOFTEXT,
+                IMSTART,
+                IMEND,
+            )
+            + EXTRAS
+        ),
+        start=SPECIAL_START_ID,
+    )
+)
 
 
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
@@ -42,6 +52,7 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }
 
+
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
 
@@ -51,20 +62,35 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         vocab_file,
         errors="replace",
+        extra_vocab_file=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
 
-        self.errors = errors  # how to handle errors in decoding
+        # how to handle errors in decoding UTF-8 byte sequences
+        # use ignore if you are in streaming inference
+        self.errors = errors
 
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
         self.special_tokens = {
             token: index
-            for index, token in enumerate(
-                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
-            )
+            for index, token in SPECIAL_TOKENS
         }
 
+        # try load extra vocab from file
+        if extra_vocab_file is not None:
+            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
+            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
+            for token, index in extra_mergeable_ranks.items():
+                if token in self.mergeable_ranks:
+                    logger.info(f"extra token {token} exists, skipping")
+                    continue
+                if index in used_ids:
+                    logger.info(f'the index {index} for extra token {token} exists, skipping')
+                    continue
+                self.mergeable_ranks[token] = index
+            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
+
         enc = tiktoken.Encoding(
             "Qwen",
             pat_str=PAT_STR,
@@ -89,7 +115,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def __getstate__(self):
         # for pickle lovers
         state = self.__dict__.copy()
-        del state['tokenizer']
+        del state["tokenizer"]
         return state
 
     def __setstate__(self, state):
@@ -103,7 +129,6 @@ class QWenTokenizer(PreTrainedTokenizer):
         )
         self.tokenizer = enc
 
-
     def __len__(self) -> int:
         return self.tokenizer.n_vocab
 
@@ -126,13 +151,17 @@ class QWenTokenizer(PreTrainedTokenizer):
             ids.append(self.mergeable_ranks.get(token))
         return ids
 
-    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError('Adding regular tokens is not supported')
+            raise ValueError("Adding regular tokens is not supported")
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
             if surface_form not in SPECIAL_TOKENS:
-                raise ValueError('Adding unknown special tokens is not supported')
+                raise ValueError("Adding unknown special tokens is not supported")
         return 0
 
     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
```
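A note on the `SPECIAL_TOKENS` change: the special-token ids were previously derived from `len(self.mergeable_ranks)` at construction time, so expanding the BPE vocabulary (as the new `extra_vocab_file` path allows) would have silently shifted every special-token id. Pinning `SPECIAL_START_ID = 151643` keeps them stable. A minimal sketch of the resulting layout, using only the constants from the diff:

```python
# Minimal sketch of the id layout produced by the new SPECIAL_TOKENS.
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
EXTRAS = tuple(f"<|extra_{i}|>" for i in range(205))

SPECIAL_START_ID = 151643
SPECIAL_TOKENS = tuple(
    enumerate((ENDOFTEXT, IMSTART, IMEND) + EXTRAS, start=SPECIAL_START_ID)
)

assert SPECIAL_TOKENS[0] == (151643, "<|endoftext|>")   # fixed, regardless of BPE size
assert SPECIAL_TOKENS[2] == (151645, "<|im_end|>")
assert SPECIAL_TOKENS[-1] == (151850, "<|extra_204|>")  # 151643 + 207
```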
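`_load_tiktoken_bpe` is only partially visible in the hunk context, but the parsing loop implies the standard tiktoken vocabulary format: one base64-encoded token and its integer rank per line. A sketch of that round trip under this assumption (the toy tokens are made up):

```python
# Build and parse a toy vocab in the assumed tiktoken BPE file format:
# "<base64(token bytes)> <rank>" per line.
import base64
from typing import Dict

contents = b"\n".join(
    base64.b64encode(tok) + b" " + str(rank).encode()
    for rank, tok in enumerate([b"hello", b" world", b"!"])
)

ranks: Dict[bytes, int] = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in contents.splitlines() if line)
}
assert ranks == {b"hello": 0, b" world": 1, b"!": 2}
```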
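The new comment on `self.errors` is worth unpacking: during streaming inference a multi-byte UTF-8 character can straddle two decode calls, so `errors="replace"` emits a U+FFFD placeholder for the dangling bytes while `errors="ignore"` drops them silently. A quick illustration in plain Python:

```python
# A multi-byte UTF-8 character split mid-stream leaves an incomplete sequence.
data = "你好".encode("utf-8")  # 6 bytes, 3 per character
chunk = data[:4]               # "你" plus only 1 of the 3 bytes of "好"

print(chunk.decode("utf-8", errors="replace"))  # 你� (dangling bytes -> U+FFFD)
print(chunk.decode("utf-8", errors="ignore"))   # 你  (dangling bytes dropped)
# errors="strict" would raise UnicodeDecodeError instead.
```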
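Usage-wise, `extra_vocab_file` merges an additional vocabulary at load time; entries whose token or id collides with the base BPE ranks or the special-token range are skipped with a log message rather than raising. A hedged sketch, assuming the usual `trust_remote_code` kwarg forwarding into `__init__` (the extra file name here is hypothetical):

```python
from transformers import AutoTokenizer

# extra.tiktoken is hypothetical: same "<base64 token> <id>" format as the base
# vocab file, with ids chosen outside both the BPE ranks and the 151643+ range.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen-14B",
    trust_remote_code=True,             # picks up this tokenization_qwen.py
    extra_vocab_file="extra.tiktoken",
)
```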
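The `__getstate__`/`__setstate__` pair ("for pickle lovers") exists because `tiktoken.Encoding` wraps a native object that pickle cannot serialize; it is dropped on dump and rebuilt from `mergeable_ranks` and `special_tokens` on load. So a round trip like this should work:

```python
import pickle
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen-14B", trust_remote_code=True)
clone = pickle.loads(pickle.dumps(tok))  # tokenizer is rebuilt in __setstate__
assert clone.tokenizer.n_vocab == tok.tokenizer.n_vocab
```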
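Finally, `_add_tokens` (now reformatted across multiple lines) still enforces a fixed vocabulary: regular tokens are rejected outright, and only special tokens already known to the tokenizer are tolerated, with a return value of 0 since nothing is actually added. For example, via the standard `transformers` API:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen-14B", trust_remote_code=True)
try:
    tok.add_tokens(["<my-new-token>"])  # routed to _add_tokens(special_tokens=False)
except ValueError as err:
    print(err)  # Adding regular tokens is not supported
```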
166
 
167
  def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]: