import gradio as gr import os from openai import OpenAI ################################################ # INITIAL SETUP ################################################ # Retrieve the access token from the environment variable ACCESS_TOKEN = os.getenv("HF_TOKEN") print("Access token loaded.") # Initialize the OpenAI client with the Hugging Face Inference API endpoint client = OpenAI( base_url="https://api-inference.huggingface.co/v1/", api_key=ACCESS_TOKEN, ) print("OpenAI client initialized.") # Our main response-generating function def respond( user_message, history, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, featured_model, custom_model ): """ This function handles the chatbot response. It takes in: - user_message: the user's new message - history: the list of previous messages, each as [user_text, assistant_text] - system_message: the system prompt - max_tokens: the maximum number of tokens to generate in the response - temperature: sampling temperature - top_p: top-p (nucleus) sampling - frequency_penalty: penalize repeated tokens in the output - seed: a fixed seed for reproducibility; -1 will mean 'random' - featured_model: the user-chosen model from the radio button - custom_model: a user-specified custom model that overrides featured_model if not empty """ print(f"New user message: {user_message}") print(f"History so far: {history}") print(f"System message: {system_message}") print(f"max_tokens: {max_tokens}, temperature: {temperature}, top_p: {top_p}") print(f"frequency_penalty: {frequency_penalty}, seed: {seed}") print(f"Featured Model: {featured_model}") print(f"Custom Model: {custom_model}") # Convert seed to None if -1 (meaning random) if seed == -1: seed = None # Determine which model to use # If the user typed something in custom_model, that overrides the featured model # Otherwise we use the model selected in the radio. If neither, default to the example "meta-llama..." model_to_use = None if custom_model.strip(): model_to_use = custom_model.strip() elif featured_model is not None and featured_model.strip(): model_to_use = featured_model.strip() else: model_to_use = "meta-llama/Llama-3.3-70B-Instruct" print(f"Model selected for inference: {model_to_use}") # Construct the conversation messages for the HF Inference API messages = [{"role": "system", "content": system_message}] for user_text, assistant_text in history: if user_text: messages.append({"role": "user", "content": user_text}) if assistant_text: messages.append({"role": "assistant", "content": assistant_text}) messages.append({"role": "user", "content": user_message}) # We'll collect and stream the response response_so_far = "" # Make the streaming request to the HF Inference API print("Sending request to OpenAI/Hugging Face Inference API...") for message_chunk in client.chat.completions.create( model=model_to_use, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p, frequency_penalty=frequency_penalty, seed=seed, messages=messages, ): # The content for the partial chunk token_text = message_chunk.choices[0].delta.content response_so_far += token_text # Return partial response to Gradio to display in real-time yield response_so_far print("Completed response generation.") ################################################ # GRADIO UI + STATE MANAGEMENT ################################################ def user_submit(user_message, history): """ This function is called when the user sends a message. We simply add the user message to the conversation history. """ print("user_submit triggered.") # Append the new user message to history if not history: history = [] history = history + [[user_message, None]] return history, "" def bot_reply(history, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, featured_model, custom_model): """ This function is triggered to produce the bot's response after the user has submitted. We call 'respond' for streaming text. """ print("bot_reply triggered.") # The last conversation item has user_message, None user_message = history[-1][0] # We will stream the partial responses from 'respond' bot_response = respond( user_message=user_message, history=history[:-1], # all items except the last, because we pass the last user msg separately system_message=system_message, max_tokens=max_tokens, temperature=temperature, top_p=top_p, frequency_penalty=frequency_penalty, seed=seed, featured_model=featured_model, custom_model=custom_model ) # As we yield from the generator, we update the last item in history with the partial response # Gradio streaming logic: yield the partial updates as they come in for partial_text in bot_response: history[-1][1] = partial_text yield history # We define a small list of placeholder featured models for demonstration models_list = [ "meta-llama/Llama-2-13B-Chat-hf", "bigscience/bloom", "EleutherAI/gpt-neo-2.7B", "meta-llama/Llama-3.3-70B-Instruct" ] def filter_models(search_term): """ Filter function triggered when user types in the model_search box. Returns an updated list of models that contain the search term. """ filtered = [m for m in models_list if search_term.lower() in m.lower()] return gr.update(choices=filtered) ################################################ # BUILDING THE GRADIO LAYOUT ################################################ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo: gr.Markdown( """ # Serverless-TextGen-Hub **A UI for text generation using Hugging Face's Inference API.** Below is a simple chat interface. You can pick from **Featured Models** or specify a **Custom Model** to override the choice. If you're not sure, just use the default. """ ) # State to hold the conversation history, will be a list of [user, bot] conversation_state = gr.State([]) # Row for system message + advanced settings with gr.Accordion("Advanced Settings", open=False): system_message = gr.Textbox( label="System Message", value="You are a helpful assistant.", lines=2, info="Provides background or personality instructions to the model." ) max_tokens = gr.Slider( minimum=1, maximum=4096, value=512, step=1, label="Max new tokens" ) temperature = gr.Slider( minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature" ) top_p = gr.Slider( minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P" ) frequency_penalty = gr.Slider( minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty" ) seed = gr.Slider( minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)" ) # Featured Models + filtering with gr.Accordion("Featured Models", open=False): model_search = gr.Textbox( label="Filter Models", placeholder="Search for a featured model...", lines=1 ) featured_model_radio = gr.Radio( label="Select a featured model below", choices=models_list, value=models_list[0], # default selection interactive=True ) model_search.change( filter_models, inputs=model_search, outputs=featured_model_radio ) # This is the Custom Model box (overrides Featured Models if not empty) custom_model = gr.Textbox( label="Custom Model", value="", info="(Optional) Provide a custom HF model path. If not empty, it overrides the Featured Model." ) # The main Chatbot interface chatbot = gr.Chatbot(height=600) # Textbox for the user to type a new message with gr.Row(): user_input = gr.Textbox( show_label=False, placeholder="Type your message here (press enter or click 'Submit')", lines=2 ) submit_btn = gr.Button("Submit", variant="primary") # The user submits -> we update the conversation state submit_btn.click( fn=user_submit, inputs=[user_input, conversation_state], outputs=[conversation_state, user_input], ) # Then the bot replies, streaming the output # We pass all required arguments from the advanced settings, plus the model selection boxes submit_btn.click( fn=bot_reply, inputs=[ conversation_state, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, featured_model_radio, custom_model ], outputs=[chatbot], # 'bot_reply' is a generator, so we set streaming=True: queue=True ) # We also allow pressing Enter in user_input to do the same thing user_input.submit( fn=user_submit, inputs=[user_input, conversation_state], outputs=[conversation_state, user_input], ) user_input.submit( fn=bot_reply, inputs=[ conversation_state, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, featured_model_radio, custom_model ], outputs=[chatbot], queue=True ) gr.HTML("""

Developed by Nymbo. Powered by Hugging Face Inference API.

""") # Finally, launch the app if __name__ == "__main__": print("Launching the Serverless-TextGen-Hub application...") demo.launch()