import argparse
import pickle

import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
# Only needed if resuming training from a checkpoint (see commented-out lines in main()).
from transformers.trainer_utils import get_last_checkpoint

parser = argparse.ArgumentParser()
parser.add_argument('test', type=int)    # index into ds_names below
parser.add_argument('length', type=int)  # dataset size label (printed only)
#parser.add_argument('--input_file', type=int)
args = parser.parse_args()


def compute_metrics(eval_pred):
    # Dump the raw logits/labels to disk; the actual metrics are computed offline.
    logits, labels = eval_pred
    with open("logits_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open("labels_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Continue in a jupyter notebook from here.
    # Return an empty dict so Trainer.evaluate() can merge the metrics without errors.
    return {}


class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Multilabel classification: binary cross-entropy over each label independently.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.float().view(-1, self.model.config.num_labels),
        )
        return (loss, outputs) if return_outputs else loss


def main():
    ds_names = ["yle", "online_review", "xed", "ylilauta"]
    #ds_sizes = [1000, 3000, 10000, 32000, 9999999]
    print("test:", args.test)
    ds_name = ds_names[args.test]
    #ds_size = int(args.test.split()[1])
    ds_size = args.length
    print(ds_name, ds_size)
    metric = compute_metrics

    #print("cuda_avail:", torch.cuda.is_available())
    #checkpoint_loc = "/media/volume/output/checkpoint-275000"
    #output_dir = "/media/volume/fi_nlp/output/finetune"
    #checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
    output_dir = "/data/loc/" + ds_name

    # Most of these parameters are unused for evaluation, but pass them anyway
    # to keep the Trainer happy...
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=4,
        max_steps=10000,
        num_train_epochs=20000,
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True,
    )
    print(training_args)

    # Evaluate on the held-out test split of the tokenized dataset.
    dataset = load_from_disk(r"/data_loc/" + ds_name)["test"]
    #dataset = load_from_disk(r"C:\Users\vin\Documents\Projects\dippa\tests\ylilauta\tokenized_set").train_test_split(test_size=0.1)

    trainer_class = MultilabelTrainer
    #print("num_labels", num_labels)
    model = AutoModelForSequenceClassification.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    tokenizer = AutoTokenizer.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    # GPT-style tokenizers have no pad token by default; reuse the EOS token for padding.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    print("init trainer")
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator,
    )

    #checkpoint = None
    #checkpoint = get_last_checkpoint(output_dir)
    #train_result = trainer.train()
    #trainer.save_state()
    metrics = trainer.evaluate()
    print(metrics)
    #trainer.save_model()  # Saves the tokenizer too for easy upload


if __name__ == "__main__":
    main()
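
# ---------------------------------------------------------------------------
# Hedged sketch, not part of the original run path: compute_metrics above only
# pickles the raw logits/labels and notes "continue in a jupyter notebook from
# here". The helper below illustrates what that offline step could look like
# for the multilabel case. The 0.5 sigmoid threshold and the sklearn F1 scores
# are assumptions, not taken from the original script.
def analyze_dumped_predictions(ds_name="xed", threshold=0.5):
    import numpy as np
    from sklearn.metrics import f1_score

    with open("logits_{}.pickle".format(ds_name), "rb") as handle:
        logits = pickle.load(handle)
    with open("labels_{}.pickle".format(ds_name), "rb") as handle:
        labels = pickle.load(handle)

    # Multilabel decision rule: sigmoid each logit independently, then threshold.
    probs = torch.sigmoid(torch.as_tensor(logits)).numpy()
    preds = (probs >= threshold).astype(int)
    labels = np.asarray(labels).astype(int)

    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }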