Text Generation
Transformers
Safetensors
llama
text-generation-inference
Inference Endpoints
mfromm committed on
Commit
4159c04
·
verified ·
1 Parent(s): ded915c

Update gptx_tokenizer.py

Browse files
Files changed (1) hide show
  1. gptx_tokenizer.py +17 -25
gptx_tokenizer.py CHANGED
@@ -11,13 +11,8 @@ from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_c
11
  from transformers.tokenization_utils import PreTrainedTokenizer
12
  from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
13
 
14
- # Define special tokens used in the tokenizer
15
- EOD_TOKEN = "<eod>"
16
- PAD_TOKEN = "<pad>"
17
- BOS_TOKEN = "<s>"
18
- EOS_TOKEN = "</s>"
19
- UNK_TOKEN = "<unk>"
20
- REPO_ID = "openGPT-X/Teuken-7B-instruct-research-v0.4"
21
 
22
  class HFGPTXTokenizer(PreTrainedTokenizer):
23
  """
@@ -141,7 +136,6 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
141
  return tokenizer_config_file_or_name
142
  except Exception as e:
143
  raise OSError(f"Failed to download tokenizer model: {str(e)}")
144
-
145
  def __init__(
146
  self,
147
  model_path: Optional[str] = None,
@@ -172,22 +166,16 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
172
  # Since there is no corresponding mapping for EOS from `tok` in
173
  # HuggingFace, it is treated as an additional special token.
174
  # Same for all other special tokens.
175
- self.eos_token = EOD_TOKEN
176
- self.bos_token = BOS_TOKEN
177
- self.pad_token = PAD_TOKEN
178
-
179
- if not self.additional_special_tokens:
180
- self.additional_special_tokens = [
181
- token
182
- for token in self.create_list_of_special_tokens()
183
- # Filter out the special tokens we added manually.
184
- if token
185
- not in [
186
- self.eos_token,
187
- self.bos_token,
188
- self.pad_token,
189
- ]
190
- ]
191
  if config_path is None:
192
  config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
193
 
@@ -244,6 +232,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
244
  self,
245
  token_ids: Union[List[int], List[List[int]]],
246
  num_threads: Optional[int] = None,
 
247
  ) -> str:
248
  """
249
  Decode a list of token IDs into a string.
@@ -253,7 +242,10 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
253
  Returns:
254
  str: Decoded string.
255
  """
256
- return self.tok.decode(input=token_ids, num_threads=num_threads)
 
 
 
257
 
258
  def _convert_id_to_token(self, index: int) -> str:
259
  """
 
11
  from transformers.tokenization_utils import PreTrainedTokenizer
12
  from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
13
 
14
+
15
+ REPO_ID = "openGPT-X/Teuken-7B-instruct-commercial-v0.4"
 
 
 
 
 
16
 
17
  class HFGPTXTokenizer(PreTrainedTokenizer):
18
  """
 
136
  return tokenizer_config_file_or_name
137
  except Exception as e:
138
  raise OSError(f"Failed to download tokenizer model: {str(e)}")
 
139
  def __init__(
140
  self,
141
  model_path: Optional[str] = None,
 
166
  # Since there is no corresponding mapping for EOS from `tok` in
167
  # HuggingFace, it is treated as an additional special token.
168
  # Same for all other special tokens.
169
+
170
+
171
+ self.unk_token = "<unk>"
172
+ self.eos_token = "</s>"
173
+ self.bos_token = "<s>"
174
+ self.pad_token = "<pad>"
175
+ self.eod_token = "<eod>"
176
+
177
+ self.additional_special_tokens = self.create_list_of_special_tokens()
178
+
 
 
 
 
 
 
179
  if config_path is None:
180
  config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
181
 
 
232
  self,
233
  token_ids: Union[List[int], List[List[int]]],
234
  num_threads: Optional[int] = None,
235
+ skip_special_tokens: bool = False,
236
  ) -> str:
237
  """
238
  Decode a list of token IDs into a string.
 
242
  Returns:
243
  str: Decoded string.
244
  """
245
+ output = self.tok.decode(input=token_ids, num_threads=num_threads)
246
+ if skip_special_tokens:
247
+ token_ids = [token for token in output if token not in self.additional_special_tokens]
248
+ return output
249
 
250
  def _convert_id_to_token(self, index: int) -> str:
251
  """