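"""Gradio chat demo for a PEFT fine-tune of microsoft/phi-2.

Assumes the packages gradio, torch, transformers, peft, and accelerate are
installed (accelerate is required for device_map="auto" below).
"""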
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# --- 1. Check CUDA Availability and Set Device ---
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device} ({torch.cuda.get_device_name(0)})")
else:
    print("CUDA is not available. Falling back to CPU.")
    device = torch.device("cpu")

# --- 2. Load Tokenizer (with error handling) ---
MODEL_PATH = "sagar007/phi2_25k"
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # phi-2's tokenizer has no pad token, so reuse the EOS token
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    exit()

# --- 3. Load Base Model (Optimized for GPU) ---
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # half precision only on GPU
        device_map="auto",  # let accelerate place the model on the available device(s)
        trust_remote_code=True
    )
except Exception as e:
    print(f"Error loading base model: {e}")
    exit()

# --- 4. Load PEFT Model (Optimized for GPU) ---
try:
    model = PeftModel.from_pretrained(base_model, MODEL_PATH)  # attach the fine-tuned PEFT adapter
except Exception as e:
    print(f"Error loading PEFT model: {e}")
    exit()

# device_map="auto" has already placed the model on the right device, and calling
# .to() on an accelerate-dispatched model can fail, so just switch to inference mode.
model.eval()

# --- 5. Generation Function (Optimized for GPU) ---
def generate_response(instruction, max_length=512):
    prompt = f"Instruction: {instruction}\nResponse:"
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,  # max_length includes the prompt tokens
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("Response:", 1)[-1].strip()  # keep only the text after the prompt's "Response:" marker
    except Exception as e:
        print(f"Error during generation: {e}")
        return "Error during response generation."

# --- 6. Gradio Interface ---
def chatbot(message, history):
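    # `history` is supplied by gr.ChatInterface but is intentionally unused here:
    # each message is answered independently of the previous turns.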
    response = generate_response(message)
    return response

demo = gr.ChatInterface(
    chatbot,
    title="Fine-tuned Phi-2 Chatbot (GPU)",
    description="This is a chatbot using a fine-tuned version of the Phi-2 model, running on GPU.",
    theme="default",
    examples=[
        "Explain the concept of machine learning.",
        "Write a short story about a robot learning to paint.",
        "What are some effective ways to reduce stress?",
    ],
    cache_examples=False,  # set to True to pre-compute responses for the examples at startup
)

if __name__ == "__main__":
    demo.launch()
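    # When running outside a hosted environment, demo.launch(share=True) can be
    # used instead to expose a temporary public URL.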