import os

import datasets
import torch  # only needed if the TF32 toggles below are enabled
from transformers import (
    GPT2TokenizerFast,
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from tokenizers.processors import TemplateProcessing

# Example local paths from development:
#config_name = "C:\\Users\\vin\\Documents\\Projects\\NLP\\kielimalli\\config.json"
#tokenizer_file = "C:\\Users\\vin\\Documents\\Projects\\NLP\\models\\tokens.json"
#input_dir = "H:\\Data_temp\\tokenized_dataset"
#output_dir = "H:\\Data_temp\\checkpoints\\model1"


def main():
    # Enable if required by your environment
    #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    #torch.backends.cuda.matmul.allow_tf32 = True
    #torch.backends.cudnn.allow_tf32 = True

    config_name = "config_large_bpe.json"
    tokenizer_files = "/path/to/tokenizer/files"
    input_dir = "/data/dir"
    output_dir = "/out/dir"

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=2.067e-5,
        lr_scheduler_type="linear",
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        gradient_accumulation_steps=32,
        num_train_epochs=6.7,
        save_total_limit=2,
        dataloader_num_workers=10,
        save_steps=100,
        warmup_steps=1000,
        do_eval=True,
        eval_steps=1000,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=100,
        bf16=True,
        tf32=True,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        bf16_full_eval=True,
    )

    print("setting up tokenizer...")
    tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_files)
    #tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # probably wrong
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

    # Append the EOS token to every tokenized sequence.
    tokenizer._tokenizer.post_processor = TemplateProcessing(
        single="$0 " + tokenizer.eos_token,
        pair="$A " + tokenizer.eos_token + " $B:1 " + tokenizer.eos_token,
        # Map the EOS string to its actual id; a hard-coded 0 would insert the
        # wrong token unless the EOS token happens to have id 0.
        special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],
    )

    print("loading model...")
    config = AutoConfig.from_pretrained(config_name)
    model = AutoModelForCausalLM.from_config(config)
    # Use from_pretrained instead if restarting training completely but
    # initializing the weights from an existing checkpoint:
    #model = AutoModelForCausalLM.from_pretrained("/checkpoint/dir")
    model.gradient_checkpointing_enable()  # optional: saves memory at the cost of speed

    print("loading data...")
    # The dataset is expected to already contain input_ids/attention_mask/labels,
    # since default_data_collator only stacks tensors and does no padding.
    dataset = datasets.load_from_disk(input_dir)

    print("starting training...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=default_data_collator,
        # To save time, do not evaluate on the whole test set during training.
        eval_dataset=dataset["test"].select(range(10000)),
        tokenizer=tokenizer,
    )

    # Resume from the latest checkpoint in output_dir, if one exists.
    #checkpoint = None
    checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
    print("checkpoint:", checkpoint)
    trainer.train(resume_from_checkpoint=checkpoint)


if __name__ == "__main__":
    main()
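
# The training script above expects `input_dir` to already hold a tokenized
# DatasetDict with "train" and "test" splits whose rows carry input_ids,
# attention_mask and labels, because default_data_collator only stacks tensors.
# The helper below is a minimal sketch of how such a dataset could be produced;
# the name prepare_dataset_sketch, the "corpus.txt" input file, the block size
# of 1024 and the 1% test split are illustrative assumptions, not values taken
# from the script above. It is never called by the script.
def prepare_dataset_sketch(tokenizer_dir="/path/to/tokenizer/files",
                           text_file="corpus.txt",
                           save_dir="/data/dir",
                           block_size=1024):
    from datasets import load_dataset

    tok = GPT2TokenizerFast.from_pretrained(tokenizer_dir)

    raw = load_dataset("text", data_files={"train": text_file})["train"]
    raw = raw.train_test_split(test_size=0.01, seed=42)

    def tokenize(batch):
        return tok(batch["text"])

    def group(batch):
        # Concatenate all token ids and cut them into fixed-length blocks.
        ids = sum(batch["input_ids"], [])
        total = (len(ids) // block_size) * block_size
        chunks = [ids[i:i + block_size] for i in range(0, total, block_size)]
        return {
            "input_ids": chunks,
            "attention_mask": [[1] * block_size for _ in chunks],
            "labels": [list(c) for c in chunks],  # causal LM: labels mirror the inputs
        }

    tokenized = raw.map(tokenize, batched=True, remove_columns=["text"])
    grouped = tokenized.map(group, batched=True,
                            remove_columns=tokenized["train"].column_names)
    grouped.save_to_disk(save_dir)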