import os

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Login to the Hugging Face Hub
access_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
login(token=access_token)
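# Note: on a Hugging Face Space, HUGGING_FACE_HUB_TOKEN is typically set as a
# repository secret, which the platform exposes to the app as an environment variable.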

# Define model details
peft_model_id = "kuyesu22/sunbird-ug-lang-v1.0-llama-2-7b-hf-lora"  # Fine-tuned Llama 2 LoRA adapter
config = PeftConfig.from_pretrained(peft_model_id)

# Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,   # Half-precision weights for faster inference
    device_map="auto",           # Automatically allocate to available devices
    offload_folder="./offload",  # Directory for offloading layers if needed
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Set the tokenizer's padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS as the padding token if none is defined
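# Note: Llama 2 tokenizers ship without a dedicated pad token; reusing EOS is
# safe for inference because the attention mask already marks padded positions.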

# Load the LoRA fine-tuned adapter on top of the base model
model = PeftModel.from_pretrained(model, peft_model_id)

# Set model to evaluation mode
model.eval()
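# eval() disables train-time behaviour such as dropout; gradient tracking is
# turned off separately with torch.no_grad() in the inference function below.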

# Define the inference function for translation
def make_inference(english_text):
    # Format the prompt for the English -> Runyankole language pair
    prompt = (
        "You are an English to Runyankole translator. Runyankole is a language "
        "spoken by Bantu people in western Uganda. Translate the following "
        "sentence accurately so that it reads naturally. "
        f"### English:\n{english_text}\n\n### Runyankole:"
    )
    batch = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256).to(model.device)

    # Generate the translation
    with torch.no_grad():
        with torch.cuda.amp.autocast():  # Mixed-precision inference for speed
            output_tokens = model.generate(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_new_tokens=100,
                do_sample=True,
                temperature=0.7,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
            )

    # Decode only the newly generated tokens: generate() returns the prompt
    # followed by the continuation, so slice the prompt tokens off first
    new_tokens = output_tokens[0][batch["input_ids"].shape[1]:]
    translated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return translated_text

# Gradio interface
def launch_gradio_interface():
    inputs = gr.Textbox(lines=2, label="English Text")        # Input text in English
    outputs = gr.Textbox(label="Translated Runyankole Text")  # Output in Runyankole

    # Launch the Gradio app with a public sharing link enabled
    gr.Interface(
        fn=make_inference,
        inputs=inputs,
        outputs=outputs,
        title="Dialogue of Delivery Translator",
        description="Translate English to Runyankole using a Llama 2 model fine-tuned with LoRA.",
    ).launch(share=True)  # share=True creates a public link

# Entry point to run the Gradio app
if __name__ == "__main__":
    launch_gradio_interface()
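
# Example usage (hypothetical sentence): the inference function can also be
# called directly, without the Gradio UI, e.g. from a notebook or test script:
#   print(make_inference("Good morning, how are you?"))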