import gradio as gr
import os
from openai import OpenAI

################################################
#                INITIAL SETUP
################################################

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

# Our main response-generating function
def respond(
    user_message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,
    custom_model
):
    """
    This function handles the chatbot response. It takes in:
    - user_message: the user's new message
    - history: the list of previous messages, each as [user_text, assistant_text]
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 will mean 'random'
    - featured_model: the user-chosen model from the radio button
    - custom_model: a user-specified custom model that overrides featured_model if not empty
    """

    print(f"New user message: {user_message}")
    print(f"History so far: {history}")
    print(f"System message: {system_message}")
    print(f"max_tokens: {max_tokens}, temperature: {temperature}, top_p: {top_p}")
    print(f"frequency_penalty: {frequency_penalty}, seed: {seed}")
    print(f"Featured Model: {featured_model}")
    print(f"Custom Model: {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Determine which model to use:
    # a non-empty custom_model overrides the featured model; otherwise we use
    # the radio selection, and if neither is set we fall back to the default,
    # meta-llama/Llama-3.3-70B-Instruct.
    if custom_model.strip():
        model_to_use = custom_model.strip()
    elif featured_model is not None and featured_model.strip():
        model_to_use = featured_model.strip()
    else:
        model_to_use = "meta-llama/Llama-3.3-70B-Instruct"

    print(f"Model selected for inference: {model_to_use}")

    # Construct the conversation messages for the HF Inference API
    messages = [{"role": "system", "content": system_message}]
    for user_text, assistant_text in history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": user_message})

    # We'll collect and stream the response
    response_so_far = ""

    # Make the streaming request to the HF Inference API
    print("Sending request to OpenAI/Hugging Face Inference API...")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # The content for this partial chunk; the final chunk (and role-only
        # chunks) can arrive with content=None, so fall back to an empty string.
        token_text = message_chunk.choices[0].delta.content or ""
        response_so_far += token_text
        # Return partial response to Gradio to display in real-time
        yield response_so_far

    print("Completed response generation.")

################################################
#          GRADIO UI + STATE MANAGEMENT
################################################

def user_submit(user_message, history):
    """
    This function is called when the user sends a message.
    We simply add the user message to the conversation history.
    """
    print("user_submit triggered.")
    # Append the new user message to history
    if not history:
        history = []
    history = history + [[user_message, None]]
    return history, ""

def bot_reply(history, system_message, max_tokens, temperature, top_p,
              frequency_penalty, seed, featured_model, custom_model):
    """
    This function is triggered to produce the bot's response after the user has submitted.
    We call 'respond' for streaming text.
    """
    print("bot_reply triggered.")

    # The last history item is [user_message, None], just appended by user_submit
    user_message = history[-1][0]

    # We will stream the partial responses from 'respond'
    bot_response = respond(
        user_message=user_message,
        history=history[:-1],  # all items except the last, because we pass the last user msg separately
        system_message=system_message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        featured_model=featured_model,
        custom_model=custom_model
    )

    # As we yield from the generator, we update the last item in history with the partial response
    # Gradio streaming logic: yield the partial updates as they come in
    for partial_text in bot_response:
        history[-1][1] = partial_text
        yield history

# We define a small list of placeholder featured models for demonstration
models_list = [
    "meta-llama/Llama-2-13B-Chat-hf",
    "bigscience/bloom",
    "EleutherAI/gpt-neo-2.7B",
    "meta-llama/Llama-3.3-70B-Instruct"
]

def filter_models(search_term):
    """
    Filter function triggered when user types in the model_search box.
    Returns an updated list of models that contain the search term.
    """
    filtered = [m for m in models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
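
# For instance, filter_models("llama") narrows the radio choices to the two
# meta-llama entries above, while an empty search restores the full list
# (every name contains the empty string).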


################################################
#        BUILDING THE GRADIO LAYOUT
################################################

with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown(
        """
        # Serverless-TextGen-Hub
        **A UI for text generation using Hugging Face's Inference API.**
        
        Below is a simple chat interface. You can pick from **Featured Models** or specify a **Custom Model** 
        to override the choice. If you're not sure, just use the default.
        """
    )

    # State holding the conversation history as a list of [user, bot] pairs
    conversation_state = gr.State([])

    # Row for system message + advanced settings
    with gr.Accordion("Advanced Settings", open=False):
        system_message = gr.Textbox(
            label="System Message",
            value="You are a helpful assistant.",
            lines=2,
            info="Provides background or personality instructions to the model."
        )
        max_tokens = gr.Slider(
            minimum=1,
            maximum=4096,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P"
        )
        frequency_penalty = gr.Slider(
            minimum=-2.0,
            maximum=2.0,
            value=0.0,
            step=0.1,
            label="Frequency Penalty"
        )
        seed = gr.Slider(
            minimum=-1,
            maximum=65535,
            value=-1,
            step=1,
            label="Seed (-1 for random)"
        )

    # Featured Models + filtering
    with gr.Accordion("Featured Models", open=False):
        model_search = gr.Textbox(
            label="Filter Models",
            placeholder="Search for a featured model...",
            lines=1
        )
        featured_model_radio = gr.Radio(
            label="Select a featured model below",
            choices=models_list,
            value=models_list[0],  # default selection
            interactive=True
        )
        model_search.change(
            filter_models,
            inputs=model_search,
            outputs=featured_model_radio
        )

    # This is the Custom Model box (overrides Featured Models if not empty)
    custom_model = gr.Textbox(
        label="Custom Model",
        value="",
        info="(Optional) Provide a custom HF model path. If not empty, it overrides the Featured Model."
    )

    # The main Chatbot interface
    chatbot = gr.Chatbot(height=600)

    # Textbox for the user to type a new message
    with gr.Row():
        user_input = gr.Textbox(
            show_label=False,
            placeholder="Type your message here (press enter or click 'Submit')",
            lines=2
        )
        submit_btn = gr.Button("Submit", variant="primary")

    # The user submits -> record the message in conversation state, then have
    # the bot reply, streaming its output. Chaining with .then() guarantees
    # bot_reply only runs after user_submit has appended the new message;
    # two independent .click() listeners would not enforce that ordering.
    submit_btn.click(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        # bot_reply is a generator; queue=True lets Gradio stream each
        # yielded update into the Chatbot in real time.
        queue=True
    )

    # Pressing Enter in user_input triggers the same submit-then-reply chain
    user_input.submit(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        queue=True
    )

    gr.HTML("""
    <br>
    <p style='text-align:center;'>
        Developed by <strong>Nymbo</strong>. 
        Powered by <strong>Hugging Face Inference API</strong>.
    </p>
    """)

# Finally, launch the app
if __name__ == "__main__":
    print("Launching the Serverless-TextGen-Hub application...")
    demo.launch()