File size: 2,108 Bytes
e4dbc4b
 
 
 
 
42b72d5
5a075e9
e4dbc4b
 
 
 
 
 
ac69c70
e4dbc4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a075e9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr

# Hub identifier of the checkpoint to serve; change it to point at another model.
model_id = "decision-oaif/Meta-Llama-3.1-8B-Instruct-sft-intercode-bash-iter1"

# Instantiate the causal LM with bfloat16 weights and device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
# Adapter loading is currently disabled:
# model.load_adapter(peft_model_id)

# Build the tokenizer from the same checkpoint. Padding and truncation are both
# applied on the left, so the right-hand end of a prompt is what gets kept.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding=True, truncation=True)
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def generate_response(messages):
    """Generate the assistant's reply for a chat conversation.

    Args:
        messages: Conversation history as a list of dicts of the form
            ``{"role": "user" | "assistant", "content": "<text>"}``.

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        tokens are sliced off and special tokens are skipped when decoding.
    """
    # Render the conversation into a single prompt string, ending with the
    # generation prompt so the model continues with the assistant's turn.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Chat templates normally emit the special tokens (e.g. BOS) themselves,
    # so the tokenizer must not add them a second time when encoding the
    # already-rendered text.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    ).to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=256,
        # Temperature only takes effect when sampling is enabled; request it
        # explicitly instead of relying on the checkpoint's generation config.
        do_sample=True,
        temperature=0.3,
        # Stop on either the tokenizer's EOS token or the end-of-turn marker.
        eos_token_id=[
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ],
        pad_token_id=tokenizer.eos_token_id,
    )

    # Only one prompt is submitted, so take the first sequence and drop the
    # leading prompt tokens before decoding.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


# Gradio UI: a JSON input (the list of message dicts) mapped to a text output.
iface = gr.Interface(
    fn=generate_response,
    inputs="json",
    outputs="text",
    title="Meta-Llama-3-8B-Instruct",
)

# Start serving the interface.
iface.launch()