sc2582 committed
Commit e4dbc4b · verified · 1 Parent(s): e1ff60d

Create app.py

Files changed (1)
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import gradio as gr
+
+ # Adjust this to your model ID
+ model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+ peft_model_id = "decision-oaif/Meta-Llama-3-8B-Instruct-sft-intercode-python-iter0"
+ # Load the base model with automatic device placement and bfloat16 weights,
+ # then attach the PEFT adapter
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+ model.load_adapter(peft_model_id)
+
+ # Load the tokenizer; truncation/padding are applied per call below, and the
+ # left side is truncated/padded so the most recent chat turns are kept
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ tokenizer.truncation_side = "left"
+ tokenizer.padding_side = "left"
+
+ # Ensure a pad token is set (the Llama-3 tokenizer ships without one)
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ def generate_response(messages):
+     # messages should be a list of {"role": "user"/"assistant", "content": "<text>"}
+     # Apply the chat template to build a single prompt string
+     message = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+
+     # Tokenize inputs
+     tokenized_inputs = tokenizer(
+         message, return_tensors="pt", padding=True, truncation=True, max_length=512
+     ).to(model.device)
+
+     # Generate a response; stop on either the standard EOS token or
+     # Llama-3's end-of-turn token <|eot_id|>
+     outputs = model.generate(
+         tokenized_inputs["input_ids"],
+         attention_mask=tokenized_inputs["attention_mask"],
+         max_new_tokens=256,
+         do_sample=True,  # sampling must be enabled for temperature to take effect
+         temperature=0.3,
+         eos_token_id=[
+             tokenizer.eos_token_id,
+             tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+         ],
+         pad_token_id=tokenizer.eos_token_id
+     )
+
+     # Decode only the newly generated tokens, excluding the prompt
+     response = tokenizer.decode(
+         outputs[0][tokenized_inputs["input_ids"].shape[-1]:],
+         skip_special_tokens=True
+     )
+     return response
+
+
+ # Create a Gradio interface that takes the messages list as JSON input
+ iface = gr.Interface(fn=generate_response, inputs="json", outputs="text", title="Meta-Llama-3-8B-Instruct")
+
+ # Launch the interface
+ iface.launch()