"""
Train our Tokenizers on some data, just to see them in action.
The whole thing runs in ~25 seconds on my laptop.
"""
import os
import time
from minbpe import BasicTokenizer, RegexTokenizer
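
# BasicTokenizer runs byte-level BPE directly on the raw byte stream, while
# RegexTokenizer first splits the text with a GPT-4-style regex pattern so
# that merges never cross categories such as letters, digits, and punctuation.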
# open some text and train a vocab of 512 tokens (256 byte tokens + 256 BPE merges)
with open("tests/taylorswift.txt", "r", encoding="utf-8") as f:
    text = f.read()
# create a directory for models, so we don't pollute the current directory
os.makedirs("models", exist_ok=True)
t0 = time.time()
for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]):
    # construct the Tokenizer object and kick off verbose training
    tokenizer = TokenizerClass()
    tokenizer.train(text, 512, verbose=True)
    # writes two files in the models directory: name.model, and name.vocab
    prefix = os.path.join("models", name)
    tokenizer.save(prefix)
t1 = time.time()
print(f"Training took {t1 - t0:.2f} seconds")