import gradio as gr
import os
from openai import OpenAI
################################################
# INITIAL SETUP
################################################
# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN is None:
    print("Warning: HF_TOKEN is not set; requests to the Inference API will fail.")
else:
    print("Access token loaded.")
# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
base_url="https://api-inference.huggingface.co/v1/",
api_key=ACCESS_TOKEN,
)
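# Optional sanity check (commented out so it never runs at import time): a
# minimal non-streaming call, assuming the token has Inference API access.
# resp = client.chat.completions.create(
#     model="meta-llama/Llama-3.3-70B-Instruct",  # the default model used below
#     messages=[{"role": "user", "content": "ping"}],
#     max_tokens=4,
# )
# print(resp.choices[0].message.content)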
print("OpenAI client initialized.")
# Our main response-generating function
def respond(
user_message,
history,
system_message,
max_tokens,
temperature,
top_p,
frequency_penalty,
seed,
featured_model,
custom_model
):
"""
This function handles the chatbot response. It takes in:
- user_message: the user's new message
- history: the list of previous messages, each as [user_text, assistant_text]
- system_message: the system prompt
- max_tokens: the maximum number of tokens to generate in the response
- temperature: sampling temperature
- top_p: top-p (nucleus) sampling
- frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
- featured_model: the user-chosen model from the radio button
- custom_model: a user-specified custom model that overrides featured_model if not empty
"""
print(f"New user message: {user_message}")
print(f"History so far: {history}")
print(f"System message: {system_message}")
print(f"max_tokens: {max_tokens}, temperature: {temperature}, top_p: {top_p}")
print(f"frequency_penalty: {frequency_penalty}, seed: {seed}")
print(f"Featured Model: {featured_model}")
print(f"Custom Model: {custom_model}")
# Convert seed to None if -1 (meaning random)
if seed == -1:
seed = None
    # Determine which model to use:
    #   1. a non-empty Custom Model box overrides everything
    #   2. otherwise, use the Featured Models radio selection
    #   3. otherwise, fall back to meta-llama/Llama-3.3-70B-Instruct
model_to_use = None
if custom_model.strip():
model_to_use = custom_model.strip()
elif featured_model is not None and featured_model.strip():
model_to_use = featured_model.strip()
else:
model_to_use = "meta-llama/Llama-3.3-70B-Instruct"
print(f"Model selected for inference: {model_to_use}")
# Construct the conversation messages for the HF Inference API
messages = [{"role": "system", "content": system_message}]
for user_text, assistant_text in history:
if user_text:
messages.append({"role": "user", "content": user_text})
if assistant_text:
messages.append({"role": "assistant", "content": assistant_text})
messages.append({"role": "user", "content": user_message})
# We'll collect and stream the response
response_so_far = ""
# Make the streaming request to the HF Inference API
print("Sending request to OpenAI/Hugging Face Inference API...")
for message_chunk in client.chat.completions.create(
model=model_to_use,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
frequency_penalty=frequency_penalty,
seed=seed,
messages=messages,
):
        # The partial text for this chunk. 'delta.content' can be None on
        # housekeeping chunks (e.g. the final one that only carries a
        # finish_reason), so fall back to an empty string before concatenating.
        token_text = message_chunk.choices[0].delta.content or ""
        response_so_far += token_text
# Return partial response to Gradio to display in real-time
yield response_so_far
print("Completed response generation.")
################################################
# GRADIO UI + STATE MANAGEMENT
################################################
def user_submit(user_message, history):
"""
This function is called when the user sends a message.
We simply add the user message to the conversation history.
"""
print("user_submit triggered.")
# Append the new user message to history
if not history:
history = []
history = history + [[user_message, None]]
return history, ""
def bot_reply(history, system_message, max_tokens, temperature, top_p,
frequency_penalty, seed, featured_model, custom_model):
"""
This function is triggered to produce the bot's response after the user has submitted.
We call 'respond' for streaming text.
"""
print("bot_reply triggered.")
    # The last history item is [user_message, None]; pull out the user message
user_message = history[-1][0]
# We will stream the partial responses from 'respond'
bot_response = respond(
user_message=user_message,
history=history[:-1], # all items except the last, because we pass the last user msg separately
system_message=system_message,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
frequency_penalty=frequency_penalty,
seed=seed,
featured_model=featured_model,
custom_model=custom_model
)
# As we yield from the generator, we update the last item in history with the partial response
# Gradio streaming logic: yield the partial updates as they come in
for partial_text in bot_response:
history[-1][1] = partial_text
yield history
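# Each yield re-renders the Chatbot with the growing reply, e.g.
#   [["Hello", "Hi"]], [["Hello", "Hi there"]], [["Hello", "Hi there!"]], ...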
# We define a small list of placeholder featured models for demonstration
models_list = [
"meta-llama/Llama-2-13B-Chat-hf",
"bigscience/bloom",
"EleutherAI/gpt-neo-2.7B",
"meta-llama/Llama-3.3-70B-Instruct"
]
def filter_models(search_term):
"""
Filter function triggered when user types in the model_search box.
Returns an updated list of models that contain the search term.
"""
filtered = [m for m in models_list if search_term.lower() in m.lower()]
return gr.update(choices=filtered)
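# Example: typing "llama" narrows the radio choices to the matching entries:
#   ["meta-llama/Llama-2-13B-Chat-hf", "meta-llama/Llama-3.3-70B-Instruct"]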
################################################
# BUILDING THE GRADIO LAYOUT
################################################
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
gr.Markdown(
"""
# Serverless-TextGen-Hub
**A UI for text generation using Hugging Face's Inference API.**
Below is a simple chat interface. You can pick from **Featured Models** or specify a **Custom Model**
to override the choice. If you're not sure, just use the default.
"""
)
# State to hold the conversation history, will be a list of [user, bot]
conversation_state = gr.State([])
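    # e.g. after one full exchange the state holds [["Hello", "Hi! How can I help?"]]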
    # Accordion for the system message + advanced generation settings
with gr.Accordion("Advanced Settings", open=False):
system_message = gr.Textbox(
label="System Message",
value="You are a helpful assistant.",
lines=2,
info="Provides background or personality instructions to the model."
)
max_tokens = gr.Slider(
minimum=1,
maximum=4096,
value=512,
step=1,
label="Max new tokens"
)
temperature = gr.Slider(
minimum=0.1,
maximum=4.0,
value=0.7,
step=0.1,
label="Temperature"
)
top_p = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-P"
)
frequency_penalty = gr.Slider(
minimum=-2.0,
maximum=2.0,
value=0.0,
step=0.1,
label="Frequency Penalty"
)
seed = gr.Slider(
minimum=-1,
maximum=65535,
value=-1,
step=1,
label="Seed (-1 for random)"
)
# Featured Models + filtering
with gr.Accordion("Featured Models", open=False):
model_search = gr.Textbox(
label="Filter Models",
placeholder="Search for a featured model...",
lines=1
)
featured_model_radio = gr.Radio(
label="Select a featured model below",
choices=models_list,
value=models_list[0], # default selection
interactive=True
)
model_search.change(
filter_models,
inputs=model_search,
outputs=featured_model_radio
)
# This is the Custom Model box (overrides Featured Models if not empty)
custom_model = gr.Textbox(
label="Custom Model",
value="",
info="(Optional) Provide a custom HF model path. If not empty, it overrides the Featured Model."
)
# The main Chatbot interface
chatbot = gr.Chatbot(height=600)
# Textbox for the user to type a new message
with gr.Row():
user_input = gr.Textbox(
show_label=False,
placeholder="Type your message here (press enter or click 'Submit')",
lines=2
)
submit_btn = gr.Button("Submit", variant="primary")
    # The user submits -> update the conversation state, then stream the reply.
    # Chaining with .then() guarantees bot_reply only runs after user_submit
    # has appended the new user message to the history (two separate .click()
    # handlers on the same button would fire concurrently).
    submit_btn.click(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        # bot_reply is a generator; with queuing enabled, Gradio streams each
        # yielded update straight to the Chatbot component
        queue=True
    )
    # Pressing Enter in user_input triggers the same submit-then-reply chain
    user_input.submit(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        queue=True
    )
gr.HTML("""
Developed by Nymbo. Powered by Hugging Face Inference API.
""") # Finally, launch the app if __name__ == "__main__": print("Launching the Serverless-TextGen-Hub application...") demo.launch()