# Hugging Face Spaces status banner: "Running on Zero".
import gradio as gr | |
from transformers import pipeline, AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM | |
import torch | |
import spaces | |
# Hub repository id of the checkpoint served by this app.
MODEL_PATH = "benhaotang/phi4-qwq-sky-t1"
# Public model page, linked from the UI description.  The host is written
# without a trailing dot ("huggingface.co", not "huggingface.co.") so the link
# is the canonical URL.
MODEL_URL = f"https://huggingface.co/{MODEL_PATH}"
def load_model():
    """Load the checkpoint and tokenizer and wrap them in a text-generation pipeline.

    Both the model and the tokenizer are fetched from ``MODEL_PATH``.

    Returns:
        The ``transformers`` pipeline object built for the
        ``"text-generation"`` task.
    """
    # 8-bit loading is explicitly switched off; only the fp32 CPU-offload flag
    # is set on the quantization config.
    quant_cfg = BitsAndBytesConfig(
        load_in_8bit=False,
        llm_int8_enable_fp32_cpu_offload=True,
    )

    # Keyword arguments for the model loader, gathered in one place.
    model_options = {
        "device_map": "auto",
        "torch_dtype": torch.float16,
        "offload_folder": "offload_folder",
        "quantization_config": quant_cfg,
    }
    causal_lm = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **model_options)
    tok = AutoTokenizer.from_pretrained(MODEL_PATH)

    # Create pipeline
    return pipeline(
        "text-generation",
        model=causal_lm,
        tokenizer=tok,
        device_map="auto",
    )
# Build the pipeline once at import time; generate_response() reads this
# module-level object on every call instead of reloading the model.
pipe = load_model()
def generate_response(prompt, max_length=1024, *, generator=None):
    """Ask the model a question and return only its reply text.

    Args:
        prompt: The user's question; sent as the ``user`` message.
        max_length: Upper bound on newly generated tokens (forwarded as
            ``max_new_tokens``).
        generator: Optional callable used instead of the module-level
            ``pipe``.  It is called as ``generator(messages,
            max_new_tokens=...)`` and must return a list whose first item has
            a ``"generated_text"`` entry.  Defaults to ``pipe``.

    Returns:
        The assistant's reply with surrounding whitespace removed, or an empty
        string if the pipeline returned an empty conversation.
    """
    # NOTE(review): `spaces` is imported at file top but no `spaces.GPU`
    # decorator is applied here -- confirm whether the hosting hardware needs it.
    run = pipe if generator is None else generator

    # Convert prompt into messages format.
    # NOTE(review): "asistent" is misspelled; left untouched because the
    # system prompt text is part of what the model sees.
    messages = [
        {"role": "system", "content": "You are a helpful AI asistent. You always think step by step."},
        {"role": "user", "content": prompt},
    ]

    outputs = run(messages, max_new_tokens=max_length)
    generated = outputs[0]["generated_text"]

    if isinstance(generated, str):
        # Plain-text output echoes the conversation: keep what follows the
        # last occurrence of the prompt.  An empty prompt cannot be used as a
        # separator, so the text is returned whole in that case.
        if prompt:
            generated = generated.rpartition(prompt)[2]
        return generated.strip()

    # Chat-style input yields the whole conversation as a list of message
    # dicts; the reply is the final entry.
    if not generated:
        return ""
    reply = generated[-1]
    content = reply["content"] if isinstance(reply, dict) else reply
    return str(content).strip()
# Gradio UI: one multi-line question box in, one text box out, wired to
# generate_response.
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            label="Enter your question",
            placeholder="Ask me anything...",
            lines=5,
        ),
    ],
    outputs=gr.Textbox(label="Response", lines=10),
    title="benhaotang/phi4-qwq-sky-t1",
    description=f""" To achieve CoT and science reasoning on small scale
Model: [benhaotang/phi4-qwq-sky-t1]({MODEL_URL})""",
    examples=[
        [
            # LaTeX backslashes are escaped explicitly: "\m", "\p" and "\l"
            # are not valid escape sequences in a normal string literal.  The
            # "\n" sequences are real line breaks.
            "For a scalar field theory with interaction Lagrangian "
            "$\\mathcal{L}_{int} = g\\phi^3 + \\lambda\\phi^4$:\n"
            " 1.Enumerate all possible 1-loop Feynman diagrams contributing "
            "to a 2-to-2 scattering process\n"
            "2.For each diagram, write down its corresponding amplitude\n"
            "3. Provide Mathematica code to calculate these loop amplitudes\n"
            " Please explain your reasoning step by step."
        ]
    ],
)

# Start the web server for the interface.
demo.launch()