Spaces:

kuyesu22
/

sunbird-ug

Sleeping

File size: 3,072 Bytes

421412a
b2e8f4f
 
 
6cbb7b3
b2e8f4f
421412a
 
 
 
 
05f8c48
b2e8f4f
1cc6275
b2e8f4f
 
 
 
 
 
 
 
 
 
421412a
1cc6275
 
 
 
b2e8f4f
 
421412a
b2e8f4f
421412a
 
b2e8f4f
 
 
98d724b
1cc6275
b2e8f4f
 
421412a
b2e8f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cc6275
b2e8f4f
 
 
 
d011e52
b2e8f4f
1cc6275
b2e8f4f

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import os
import gradio as gr

# Login to Hugging Face Hub
# Authenticate only when a token is configured: login(token=None) falls back
# to an interactive prompt, which cannot be answered in a headless deployment.
# HF_TOKEN is accepted as a fallback name for the same credential.
access_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or os.environ.get("HF_TOKEN")
if access_token:
    login(token=access_token)


# Define model details
peft_model_id = "kuyesu22/sunbird-ug-lang-v1.0-llama-2-7b-hf-lora"  # Your fine-tuned Llama 2 model ID
# The adapter config records which base checkpoint the LoRA weights sit on.
config = PeftConfig.from_pretrained(peft_model_id)

# Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,    # Load weights in half precision (float16)
    device_map="auto",            # Automatically allocate to available devices
    offload_folder="./offload"    # Directory for offloading layers if needed
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Set the tokenizer's padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as padding if not already defined

# Load the LoRA fine-tuned model (wraps the base model with the adapter weights)
model = PeftModel.from_pretrained(model, peft_model_id)

# Set model to evaluation mode
model.eval()

# Define the inference function for translation
def make_inference(english_text):
    """Translate English text to Runyankole with the fine-tuned model.

    Wraps the input in the instruction prompt, samples up to 100 new tokens
    from the module-level ``model`` and decodes only the newly generated
    tokens, so the prompt itself is not echoed back to the caller.

    Args:
        english_text: The English text to translate.

    Returns:
        The generated continuation with special tokens removed and
        surrounding whitespace stripped, or ``""`` when ``english_text`` is
        empty or contains only whitespace.
    """
    # Nothing to translate: skip the expensive generation call entirely.
    if not english_text or not english_text.strip():
        return ""

    # Format the prompt based on the language pair.
    # NOTE(review): the wording (including its typos) is kept byte-for-byte;
    # presumably it matches the template used during fine-tuning -- confirm
    # before correcting it.
    prompt = f"You are English Runyakole language translator, Runyakoleis a language spoke by the bantu speaking people in western Uganda. Can you appropriately translate these user sentences appropriate and must make sense. ### English:\n{english_text}\n\n### Runyankole:"
    # The tokenized prompt is capped at 256 tokens.
    # NOTE(review): if the tokenizer truncates on the right, a very long input
    # could lose the trailing "### Runyankole:" marker -- verify.
    batch = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256).to(model.device)
    prompt_length = batch["input_ids"].shape[1]

    # Generate the translation. Autocast is enabled only when CUDA is present,
    # which avoids the warning the CUDA-only context emits on CPU hosts.
    cuda_available = torch.cuda.is_available()
    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=cuda_available):
        output_tokens = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens; keep only what the
    # model appended after them.
    generated_tokens = output_tokens[0][prompt_length:]
    translated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return translated_text.strip()

# Gradio Interface
def launch_gradio_interface():
    """Build the translation UI around ``make_inference`` and launch it.

    The interface has one two-line text box for the English input and one
    text box for the Runyankole output. It is launched with ``share=True``,
    which requests a public sharing link.
    """
    english_box = gr.components.Textbox(lines=2, label="English Text")
    runyankole_box = gr.components.Textbox(label="Translated Runyankole Text")

    # Assemble the app first, then start it as a separate step.
    demo = gr.Interface(
        fn=make_inference,
        inputs=english_box,
        outputs=runyankole_box,
        title="Dialogue of Delivery Translator",
        description="Translate English to Runyankole using Llama 2 model fine-tuned with LoRA.",
    )
    demo.launch(share=True)

# Entry point to run the Gradio app.
# The UI is launched only when this file is executed as a script; importing
# the module does not call launch_gradio_interface().
if __name__ == "__main__":
    launch_gradio_interface()