"""Fine-tune a pretrained GPT checkpoint for sequence classification.

Usage: script.py TEST LENGTH
  TEST    index into the dataset-name list in main() (0..3)
  LENGTH  number of training examples to select (capped at the train split size)

The "online_reviews" dataset is treated as a regression task (single float
label); the others are multi-label classification handled by MultilabelTrainer.
"""
import argparse
import os
import sys

import numpy as np
import torch

from datasets import concatenate_datasets, load_dataset, load_from_disk
import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    GPT2Model,
    GPT2TokenizerFast,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint


def compute_metrics(eval_pred):
    """Exact-match accuracy for multi-label classification.

    Predictions are formed by one-hot encoding the argmax of the logits;
    a sample counts as correct only when every label position matches.

    Returns:
        dict with key "acc": fraction of samples whose full label vector
        matches the prediction.
    """
    logits, labels = eval_pred
    predictions = np.zeros(logits.shape)
    predictions[np.arange(len(predictions)), logits.argmax(1)] = 1
    predictions = predictions > 0.5
    labels = labels > 0.5
    return {"acc": np.all(predictions == labels, axis=1).sum() / predictions.shape[0]}


def compute_metrics_regression(eval_pred):
    """Metrics for the single-output regression task.

    Returns:
        dict with:
          "dev":     mean absolute deviation between logits and labels,
          "perc":    percentage of samples whose rounded absolute error is 0,
          "perc_50": the same percentage computed on the first 50 samples only.
    """
    logits, labels = eval_pred
    labels = np.expand_dims(labels, 1)  # align shapes: (N,) -> (N, 1)
    abs_err = np.abs(logits - labels)
    val = abs_err.mean()
    perc = ((abs_err.round() < 1).sum() * 100) / len(labels)
    perc_50 = ((abs_err.round()[0:50] < 1).sum() * 100) / 50
    return {"dev": val, "perc": perc, "perc_50": perc_50}


class MultilabelTrainer(Trainer):
    """Trainer that optimizes BCE-with-logits over all label positions."""

    # **kwargs absorbs extra arguments (e.g. num_items_in_batch) that newer
    # transformers versions pass to compute_loss — keeps this override
    # compatible across library versions.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.float().view(-1, self.model.config.num_labels),
        )
        return (loss, outputs) if return_outputs else loss


def main():
    # Parse CLI arguments here (not at import time) so the module can be
    # imported without side effects.
    parser = argparse.ArgumentParser()
    parser.add_argument('test', type=int)
    parser.add_argument('length', type=int)
    # parser.add_argument('--input_file', type=int)
    args = parser.parse_args()

    ds_names = ["yle", "online_reviews", "xed", "ylilauta"]
    # ds_sizes = [1000, 3000, 10000, 32000, 9999999]
    print("test:", args.test)
    ds_name = ds_names[args.test]
    ds_size = args.length
    print(ds_name, ds_size)
    # online_reviews is a regression task; everything else is multi-label.
    metric = compute_metrics_regression if ds_name == "online_reviews" else compute_metrics
    # print("cuda_avail:", torch.cuda.is_available())
    # checkpoint_loc = "/media/volume/output/checkpoint-275000"
    # output_dir = "/media/volume/fi_nlp/output/finetune"
    # checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
    output_dir = "/scratch/project_462000007/hatanpav/output/dippa/gpt/" + ds_name

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=2,  # This one assumes 4x8 GPUs. Set to 64 to get global batch size of 64 with one GPU
        max_steps=10000,
        num_train_epochs=20000,  # Overriden by max_steps
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True,
    )
    print(training_args)

    dataset = load_from_disk(r"/path/to/data/" + ds_name)

    # Handle regression type task: multi-label sets store a label vector,
    # regression sets store a single float (len() raises TypeError there).
    n_labels = 1
    trainer_class = MultilabelTrainer
    try:
        n_labels = len(dataset["train"][0]["labels"])
    except (TypeError, KeyError):
        # The case of label being a float.
        n_labels = 1
        trainer_class = Trainer

    # Never request more examples than the train split actually has.
    if ds_size > len(dataset["train"]):
        ds_size = len(dataset["train"])

    model = AutoModelForSequenceClassification.from_pretrained(
        "/checkpoint/loc", num_labels=n_labels
    )
    tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
    # GPT-style tokenizers have no pad token; reuse EOS for padding.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    print("init trainer")
    train_set = dataset["train"].select(range(ds_size))
    test_set = dataset["test"]
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=test_set,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator,
    )

    checkpoint = None
    # checkpoint = get_last_checkpoint(output_dir)
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    # trainer.save_state()
    metrics = trainer.evaluate()
    print(metrics)
    trainer.save_model()  # Saves the tokenizer too for easy upload


if __name__ == "__main__":
    main()