Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

File size: 5,085 Bytes

038f313
 
4c18bfc
038f313
880ced6
 
e13eb1b
038f313
e13eb1b
038f313
 
 
 
e13eb1b
038f313
 
 
e13eb1b
69b4a5f
038f313
 
 
3a64d68
98674ca
9b9dccd
038f313
e13eb1b
52ad57a
 
 
 
 
 
 
 
 
98674ca
e13eb1b
52ad57a
f7c4208
 
86297f5
52ad57a
 
98674ca
f7c4208
52ad57a
 
 
038f313
e13eb1b
880ced6
f7c4208
 
e13eb1b
 
 
 
 
 
86297f5
e13eb1b
 
 
 
038f313
 
9b9dccd
 
98674ca
 
e13eb1b
038f313
b56d11c
f7c4208
52ad57a
e13eb1b
9b9dccd
038f313
9b9dccd
038f313
 
98674ca
 
86297f5
038f313
f7c4208
86297f5
b56d11c
 
9b9dccd
b56d11c
542c2ac
e13eb1b
f7c4208
52ad57a
e13eb1b
 
 
9b9dccd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52ad57a

import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # Do not claim success when the variable is missing: os.getenv returns
    # None in that case and the startup log should make that visible.
    print("Warning: HF_TOKEN is not set; no access token was loaded.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model
):
    """
    Stream a chat completion and yield the reply text as it grows.

    The function builds an OpenAI-style message list (system prompt, prior
    turns, new user message), sends it through the module-level `client`
    with streaming enabled, and yields the accumulated response after every
    text fragment that arrives.

    Parameters:
    - message: the user's new message
    - history: the list of previous messages, each as a pair (user_msg, assistant_msg);
      empty/None halves of a pair are skipped
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random' (sent as None)
    - custom_model: the user-provided custom model name; empty, whitespace-only
      or None selects the default model

    Yields:
    - the response text accumulated so far (a str), once per non-empty fragment
    """

    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Custom model: {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Construct the messages array required by the API
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history to the context
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})

    # Determine which model to use: either custom_model or a default.
    # `or ""` keeps a None value from the textbox from crashing on .strip().
    requested_model = (custom_model or "").strip()
    model_to_use = requested_model or "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Start with an empty string to build the response as tokens stream in
    response = ""
    print("Sending request to OpenAI API.")

    # Make the streaming request to the HF Inference API via openai-like client
    stream = client.chat.completions.create(
        model=model_to_use,              # Use either the user-provided custom model or default
        max_tokens=max_tokens,
        stream=True,                     # Stream the response
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    )
    for message_chunk in stream:
        # A streamed chunk is not guaranteed to carry a choice; indexing an
        # empty list would abort the stream with IndexError.
        if not message_chunk.choices:
            continue

        # Extract the token text from the response chunk. The delta's content
        # may be None (e.g. on a role-only or finishing chunk), and
        # `response += None` would raise TypeError, so skip empty fragments.
        token_text = message_chunk.choices[0].delta.content
        if not token_text:
            continue

        print(f"Received token: {token_text}")
        response += token_text
        # Yield the partial response to Gradio so it can display in real-time
        yield response

    print("Completed response generation.")

# Conversation display component, created up front so it can be handed to the
# ChatInterface below
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")

# Build the ChatInterface around `respond`.
# The additional inputs are listed in the same order as the extra parameters of
# `respond`: system message, max tokens, temperature, top-p, frequency penalty,
# seed, custom model.
demo = gr.ChatInterface(
    fn=respond,
    chatbot=chatbot,
    theme="Nymbo/Nymbo_Theme",
    fill_height=True,
    additional_inputs=[
        # System prompt (empty by default)
        gr.Textbox(label="System message", value=""),
        # Generation length
        gr.Slider(label="Max new tokens", minimum=1, maximum=4096, step=1, value=512),
        # Sampling controls
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        gr.Slider(label="Top-P", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
        gr.Slider(label="Frequency Penalty", minimum=-2.0, maximum=2.0, step=0.1, value=0.0),
        # Seed; the upper limit is arbitrary, chosen for demonstration
        gr.Slider(label="Seed (-1 for random)", minimum=-1, maximum=65535, step=1, value=-1),
        # Optional model override
        gr.Textbox(
            label="Custom Model",
            value="",
            info="(Optional) Provide a custom Hugging Face model path. This will override the default model if not empty."
        ),
    ],
)
print("Gradio interface initialized.")

# Launch the app only when this file is executed as a script; importing the
# module builds `demo` but does not launch it.
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()