import gradio as gr
from transformers import pipeline, AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
import torch
import spaces  # Hugging Face Spaces ZeroGPU helper

MODEL_PATH = "benhaotang/phi4-qwq-sky-t1"
MODEL_URL = f"https://huggingface.co./{MODEL_PATH}"

def load_model():
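    # With load_in_8bit=False this config is not expected to quantize any weights;
    # it mainly enables fp32 CPU offload for modules that do not fit on the GPU.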
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=False,
        llm_int8_enable_fp32_cpu_offload=True
    )
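    # Sketch of an alternative (not what this Space uses): if GPU memory were a
    # constraint, real 4-bit quantization could be enabled instead, e.g.:
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch.float16,
    # )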
    
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
        torch_dtype=torch.float16,
        offload_folder="offload_folder",
        quantization_config=bnb_config
    )
    
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )
    
    return pipe

pipe = load_model()
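# The model and pipeline are loaded once at startup; the ZeroGPU-decorated
# function below only runs inference.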

@spaces.GPU(duration=110)  # request a ZeroGPU slot for up to 110 s per call
def generate_response(prompt, max_length=1024):
    # Chat-style input: system prompt plus the user's question
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant. You always think step by step."},
        {"role": "user", "content": prompt}
    ]
    
    outputs = pipe(messages, max_new_tokens=max_length)

    # With chat-style input, outputs[0]["generated_text"] is the full message
    # list; extract the assistant's reply from the end of it.
    try:
        message_list = outputs[0]["generated_text"]
        assistant_message = message_list[-1]
        if assistant_message["role"] == "assistant":
            return assistant_message["content"]
    except Exception:
        # If the output format is unexpected, fall back to the raw output
        return str(outputs[0]["generated_text"])
        
    return outputs[0]["generated_text"]

# Example prompt (raw string so the LaTeX backslashes are not treated as escape sequences)
example_prompt = r"""For a scalar field theory with interaction Lagrangian $\mathcal{L}_{int} = g\phi^3 + \lambda\phi^4$:

1. Enumerate all possible 1-loop Feynman diagrams contributing to the scalar propagator

2. For each diagram, write down its loop contribution

3. Provide Mathematica code to calculate these loop amplitudes with dimensional regularization at $d=4-\epsilon$

Please explain your reasoning step by step."""

demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            label="Enter your question",
            placeholder="Ask me anything...",
            lines=5
        ),
    ],
    outputs=gr.Textbox(label="Response", lines=10),
    title="benhaotang/phi4-qwq-sky-t1",
    description=f"""Chain-of-thought (CoT) and science reasoning at small scale, using a merge of CoT-finetuned Phi-4 models.

Model: [benhaotang/phi4-qwq-sky-t1]({MODEL_URL})""",
    examples=[
        [example_prompt]  # Now using the formatted example
    ]
)

demo.launch()
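
# Usage sketch (kept as comments, since demo.launch() blocks): once the Space is
# running, it could be queried programmatically with gradio_client. The Space
# name below is a placeholder assumption, not necessarily the real Space ID.
#
# from gradio_client import Client
#
# client = Client("benhaotang/phi4-qwq-sky-t1")  # placeholder Space name
# result = client.predict(
#     "Explain dimensional regularization in one paragraph.",
#     api_name="/predict",  # default endpoint name for a gr.Interface
# )
# print(result)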