Spaces:

SkyNetWalker
/

HF-LLMs

Running

File size: 5,408 Bytes

1be87ac
 
fc98e77
21a478e
1be87ac
 
21a478e
8037c4b
1be87ac
 
 
 
 
 
 
 
5d7db46
1be87ac
4ca2388
b597dd2
4132916
 
 
 
4ca2388
 
1be87ac
 
 
4ca2388
 
1be87ac
 
 
4ca2388
1be87ac
 
 
4ca2388
 
 
80eed0f
1be87ac
fc98e77
1be87ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ca2388
1be87ac
 
 
 
 
 
 
 
 
 
 
 
4ca2388
4facf91
4ca2388
181f5cd
 
4ca2388
 
 
 
fc98e77
 
40b508f
4ca2388
40b508f
4ca2388
 
 
 
 
 
700ffae
 
b597dd2
700ffae
 
 
 
40b508f
700ffae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4132916
 
700ffae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40b508f
1be87ac
 
 
 
fc98e77

#refer llama recipes for more info https://github.com/huggingface/huggingface-llama-recipes/blob/main/inference-api.ipynb
#huggingface-llama-recipes : https://github.com/huggingface/huggingface-llama-recipes/tree/main

import gradio as gr
from openai import OpenAI
import os

ACCESS_TOKEN = os.getenv("myHFtoken")

print("Access token loaded.")

client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)

print("Client initialized.")

SYSTEM_PROMPTS = {
    "zh-HK": "用香港的廣東話(Cantonese)對話. No chatty. Answer in simple but accurate way.",
    "zh-TW": "Chat by Traditional Chinese language of Taiwan (zh-TW). No chatty. Answer in simple but accurate way.",
    "EN: General Assistant": "You are a helpful, respectful and honest assistant. Always provide accurate information and admit when you're not sure about something.",
    "EN: Code Helper": "You are a programming assistant. Help users with coding questions, debugging, and best practices. Provide clear explanations and code examples when appropriate.",
    "EN: Creative Writer": "You are a creative writing assistant. Help users with storytelling, character development, and creative writing techniques. Be imaginative and encouraging."
}

def respond(
    message,
    history: list[tuple[str, str]],
    preset_prompt,
    custom_prompt,
    max_tokens,
    temperature,
    top_p,
    model_name,
):
    print(f"Received message: {message}")
    print(f"History: {history}")
    
    system_message = custom_prompt if custom_prompt.strip() else SYSTEM_PROMPTS[preset_prompt]
    
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Selected model: {model_name}")

    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
            print(f"Added user message to context: {val[0]}")
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
            print(f"Added assistant message to context: {val[1]}")

    messages.append({"role": "user", "content": message})

    response = ""
    print("Sending request to OpenAI API.")
    
    for message in client.chat.completions.create(
        model=model_name,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        messages=messages,
    ):
        token = message.choices[0].delta.content
        print(f"Received token: {token}")
        response += token
        yield response

    print("Completed response generation.")

models = [
    "PowerInfer/SmallThinker-3B-Preview",
    "NovaSky-AI/Sky-T1-32B-Preview",
    "microsoft/phi-4",
    "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "microsoft/Phi-3-mini-128k-instruct",
]

with gr.Blocks() as demo:
    gr.Markdown("# LLM Test")
    
    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=models, 
            value=models[0], 
            label="Select Model:"
        )

    # Create the chat components separately
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(
        show_label=False,
        placeholder="Enter text and press enter",
        container=False
    )
    clear = gr.Button("Clear")

    # Additional inputs
    with gr.Accordion("Configuration", open=False):
        preset_prompt = gr.Dropdown(
            choices=list(SYSTEM_PROMPTS.keys()),
            value=list(SYSTEM_PROMPTS.keys())[0],
            label="Select System Prompt:"
        )
        custom_prompt = gr.Textbox(
            value="",
            label="Custom System Prompt (leaves blank to use preset):",
            lines=2
        )
        max_tokens = gr.Slider(
            minimum=1,
            maximum=8192,
            value=2048,
            step=1,
            label="Max new tokens:"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.3,
            step=0.1,
            label="Temperature:"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P:"
        )

    # Set up the chat functionality
    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(
        history,
        preset_prompt,
        custom_prompt,
        max_tokens,
        temperature,
        top_p,
        model_name
    ):
        history[-1][1] = ""
        for character in respond(
            history[-1][0],
            history[:-1],
            preset_prompt,
            custom_prompt,
            max_tokens,
            temperature,
            top_p,
            model_name
        ):
            history[-1][1] = character
            yield history

    msg.submit(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, preset_prompt, custom_prompt, max_tokens, temperature, top_p, model_dropdown],
        chatbot
    )

    clear.click(lambda: None, None, chatbot, queue=False)

print("Gradio interface initialized.")

if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()