import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch
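# Gradio chat app serving a PEFT fine-tune of microsoft/phi-2.
# Assumes the gradio, transformers, peft, and torch packages are available
# (e.g. listed in the Space's requirements.txt).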
# --- 1. Check CUDA Availability and Set Device ---
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device} ({torch.cuda.get_device_name(0)})")
else:
    print("CUDA is not available. Falling back to CPU.")
    device = torch.device("cpu")
# --- 2. Load Tokenizer (with error handling) ---
MODEL_PATH = "sagar007/phi2_25k"
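# Hub repo holding the PEFT adapter weights (trained on top of microsoft/phi-2) and its tokenizer.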
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    exit()
# --- 3. Load Base Model (Optimized for GPU) ---
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16,  # Use float16 on GPU for efficiency
        device_map="auto",          # Automatically place the model on available devices
        trust_remote_code=True
    )
except Exception as e:
    print(f"Error loading base model: {e}")
    exit()
# --- 4. Load PEFT Model (Optimized for GPU) ---
try:
    peft_config = PeftConfig.from_pretrained(MODEL_PATH)
    model = PeftModel.from_pretrained(base_model, MODEL_PATH)
except Exception as e:
    print(f"Error loading PEFT model: {e}")
    exit()
# The base model was loaded with device_map="auto", so its weights are already placed
# on the GPU; calling .to(device) again is unnecessary and can conflict with accelerate's dispatch.
model.eval()
# --- 5. Generation Function (Optimized for GPU) ---
def generate_response(instruction, max_length=512):
    prompt = f"Instruction: {instruction}\nResponse:"
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("Response:")[1].strip()
    except Exception as e:
        print(f"Error during generation: {e}")
        return "Error during response generation."
# --- 6. Gradio Interface ---
def chatbot(message, history):
    response = generate_response(message)
    return response
demo = gr.ChatInterface(
    chatbot,
    title="Fine-tuned Phi-2 Chatbot (GPU)",
    description="This is a chatbot using a fine-tuned version of the Phi-2 model, running on a GPU.",
    theme="default",
    examples=[
        "Explain the concept of machine learning.",
        "Write a short story about a robot learning to paint.",
        "What are some effective ways to reduce stress?",
    ],
    cache_examples=False,  # Skip example caching so examples are not pre-generated at startup
)
if __name__ == "__main__":
    demo.launch()