import os

# Ensure the Hugging Face cache directories are writable; set these before importing
# datasets/transformers so the cache paths are picked up at import time
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Load dataset
dataset = load_dataset("tatsu-lab/alpaca")
dataset["train"] = dataset["train"].select(range(2000))

# Check dataset structure
print("Dataset splits available:", dataset)
print("Sample row:", dataset["train"][0])

# If no 'test' split exists, create one
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1)
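# train_test_split returns a fresh DatasetDict with "train" and "test" splits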

# Load tokenizer & model
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_disable()  # keep gradient checkpointing off (the default); enable it to trade compute for memory

# Define tokenization function
def tokenize_function(examples):
    # Alpaca rows have "instruction", "input" (often empty) and "output";
    # build the source text from the instruction plus the optional input
    inputs = [
        instruction if not context else f"{instruction}\n{context}"
        for instruction, context in zip(examples["instruction"], examples["input"])
    ]
    targets = examples["output"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

#  Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

#  Assign train & eval datasets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

print("Dataset successfully split and tokenized.")

#  Define training arguments
training_args = TrainingArguments(
    output_dir="/tmp/results",  # Use /tmp/ to avoid permission errors
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_dir="/tmp/logs",  # Avoid writing to restricted directories
    logging_steps=100,
    save_total_limit=2,
    fp16=True
)
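
# Note: fp16=True above enables mixed-precision training, which requires a CUDA-capable GPU
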
# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
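
# No data_collator is passed, so Trainer falls back to default_data_collator, which simply stacks
# the already max_length-padded features; Trainer also drops the unused raw text columns by default.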

# Start fine-tuning
trainer.train()

print("Fine-tuning complete!")

# Save the fine-tuned model and tokenizer to a writable directory
save_dir = "/tmp/t5-finetuned"  # Use /tmp/, which is writable
os.makedirs(save_dir, exist_ok=True)  # Ensure the directory exists
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("Model saved successfully!")