Japanese BERT-base (Vaporetto + Unigram)

How to load the tokenizer

Please download the dictionary file for Vaporetto + Unigram from our GitHub repository. Then you can load the tokenizer by specifying the path of the dictionary file to dict_path.

from typing import Optional

from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast

import vaporetto
import textspan

class VaporettoPreTokenizer:
    def __init__(self, unidic_path: str):
        with open(unidic_path, 'rb') as fp:
            model = fp.read()
        self.tokenizer = vaporetto.Vaporetto(model, predict_tags=False)
    
    def tokenize(self, sequence: str) -> list[str]:
        tokens = self.tokenizer.tokenize(sequence)
        return [token.surface() for token in tokens]
    
    def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        text = str(normalized_string)
        tokens = self.tokenize(text)
        tokens_spans = textspan.get_original_spans(tokens, text)
        return [normalized_string[st:ed] for cahr_spans in tokens_spans for st,ed in cahr_spans]
    
    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.custom_split)

# load a pre-tokenizer
pre_tokenizer = VaporettoPreTokenizer("/path/to/bccwj-suw+unidic+tag.model.zst")

# load a tokenizer
dict_path = /path/to/vaporetto_unigram.json
tokenizer = Tokenizer.from_file(dict_path)
tokenizer.post_processor = BertProcessing(
    cls=("[CLS]", tokenizer.token_to_id('[CLS]')),
    sep=("[SEP]", tokenizer.token_to_id('[SEP]'))
)

# convert to PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    mask_token='[MASK]'
)

# set a pre-tokenizer
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)
# Test
test_str = "γ“γ‚“γ«γ‘γ―γ€‚η§γ―ε½’ζ…‹η΄ θ§£ζžε™¨γ«γ€γ„γ¦η ”η©Άγ‚’γ—γ¦γ„γΎγ™γ€‚"
tokenizer.convert_ids_to_tokens(tokenizer(test_str).input_ids)
# -> ['[CLS]','こ','γ‚“','に','け','は','。','私','は','ε½’','ζ…‹','η΄ ','解','析','器','に','぀い','て','η ”η©Ά','γ‚’','し','て','い','ます','。','[SEP]']

How to load the model

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("hitachi-nlp/bert-base_vaporetto-unigram")

See our repository for more details!

Downloads last month
16
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Datasets used to train hitachi-nlp/bert-base-japanese_vaporetto-unigram