dhigurashi committed
Commit b0f6037 · Parent: 14a911c
support transformers==4.34.0

tokenization_plamo.py (+6 -14)
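A quick way to sanity-check the change after upgrading is to load the remote-code tokenizer and round-trip a string. This is a sketch, not part of the commit: the repo id pfnet/plamo-13b is an assumption, any repository shipping this tokenization_plamo.py would do.

# Hypothetical smoke test under transformers==4.34.0; repo id is assumed.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("pfnet/plamo-13b", trust_remote_code=True)
ids = tokenizer("Hello, world!")["input_ids"]
print(ids)
print(tokenizer.decode(ids))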
tokenization_plamo.py
CHANGED

@@ -5,7 +5,6 @@ from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
 import sentencepiece as spm
-import transformers
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.utils import logging
 
@@ -35,6 +34,12 @@ class PlamoTokenizer(PreTrainedTokenizer):  # type: ignore
             kwargs["add_bos_token"] = False
         if "add_eos_token" not in kwargs:
             kwargs["add_eos_token"] = False
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self.vocab_file = vocab_file
+        self.add_bos_token = kwargs["add_bos_token"]
+        self.add_eos_token = kwargs["add_eos_token"]
 
         super().__init__(
             vocab_file=vocab_file,
@@ -50,15 +55,6 @@ class PlamoTokenizer(PreTrainedTokenizer):  # type: ignore
             **kwargs,
         )
 
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        self.vocab_file = vocab_file
-        self.add_bos_token = kwargs["add_bos_token"]
-        self.add_eos_token = kwargs["add_eos_token"]
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-
-        self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
-
     # the functions below are copied from hf transformers LlamaTokenizer's implementation to fix the behaviour of the tokenizer
     # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/llama/tokenization_llama.py
 
@@ -155,7 +151,3 @@ class PlamoTokenizer(PreTrainedTokenizer):  # type: ignore
                 fi.write(content_spiece_model)
 
         return (out_vocab_file,)
-
-
-class PlamoConfig(transformers.LlamaConfig):  # type: ignore
-    model_type = "plamo"
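Why the attribute setup moves above super().__init__(): starting with transformers 4.34, PreTrainedTokenizer.__init__ registers the special tokens itself (which is why the explicit self.add_tokens(self.all_special_tokens_extended, special_tokens=True) call is dropped), and that registration calls back into the subclass's token-conversion methods, which need a loaded sp_model. A minimal sketch of the resulting pattern, assuming a generic SentencePiece-backed slow tokenizer (the class name and method bodies are illustrative, not the full PlamoTokenizer):

# Illustrative skeleton mirroring the ordering this commit establishes.
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer


class SentencePieceTokenizerSketch(PreTrainedTokenizer):
    def __init__(self, vocab_file, sp_model_kwargs=None, **kwargs):
        # The backend must exist *before* super().__init__(): on
        # transformers>=4.34 the base constructor adds the special tokens
        # itself and calls _convert_token_to_id on this subclass.
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        self.vocab_file = vocab_file
        super().__init__(**kwargs)  # no manual add_tokens(...) needed anymore

    @property
    def vocab_size(self):
        return self.sp_model.GetPieceSize()

    def get_vocab(self):
        return {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)}

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        return self.sp_model.IdToPiece(index)

On transformers < 4.34 the old order worked because the base constructor left token registration to the subclass; under 4.34 it would fail with sp_model not yet defined, which is what this commit fixes.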