Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Community

Nymbo committed 8 days ago

Commit

f7c4208

verified ·

1 Parent(s): 7d3730f

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -91

app.py CHANGED Viewed

@@ -22,109 +22,164 @@ def respond(
     top_p,
     frequency_penalty,
     seed,
-    model_selection,
     custom_model
 ):
     """
-    This function handles the chatbot response.
     """
-    selected_model = custom_model if custom_model.strip() != "" else model_selection
-    print(f"Selected model: {selected_model}")
     if seed == -1:
         seed = None
     messages = [{"role": "system", "content": system_message}]
     for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
     messages.append({"role": "user", "content": message})
     response = ""
     for message_chunk in client.chat.completions.create(
-        model=selected_model,
         max_tokens=max_tokens,
-        stream=True,
         temperature=temperature,
         top_p=top_p,
         frequency_penalty=frequency_penalty,
         seed=seed,
         messages=messages,
     ):
         token_text = message_chunk.choices[0].delta.content
         response += token_text
         yield response
 # Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
-# Define placeholder models
-featured_models = [
     "meta-llama/Llama-3.3-70B-Instruct",
-    "gpt2",
-    "bert-base-uncased",
-    "facebook/bart-base",
-    "google/flan-t5-base"
 ]
 # Create the Gradio ChatInterface
-with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-    gr.Markdown("# Serverless Text Generation Hub")
-    with gr.Tab("Basic Settings"):
-        with gr.Row():
-            with gr.Column():
-                # Textbox for system message
-                system_message = gr.Textbox(value="", label="System message")
-        with gr.Row():
-            with gr.Column():
-                # Model selection
-                with gr.Accordion("Featured Models", open=True):
-                    model_search = gr.Textbox(label="Filter Models", placeholder="Search for a featured model...")
-                    model = gr.Radio(label="Select a model", choices=featured_models, value="meta-llama/Llama-3.3-70B-Instruct")
-                    def filter_models(search_term):
-                        filtered_models = [m for m in featured_models if search_term.lower() in m.lower()]
-                        return gr.update(choices=filtered_models)
-                    model_search.change(filter_models, inputs=model_search, outputs=model)
-        with gr.Row():
-            with gr.Column():
-                # Custom model input
-                custom_model = gr.Textbox(label="Custom Model", placeholder="Enter a custom model name")
-    with gr.Tab("Advanced Settings"):
-        with gr.Row():
-            max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
-            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
-        with gr.Row():
-            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
-            frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
-        with gr.Row():
-            seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")
     with gr.Tab("Information"):
         with gr.Accordion("Featured Models", open=False):
-            gr.Markdown(
                 """
                 <table style="width:100%; text-align:center; margin:auto;">
                     <tr>
                         <th>Model Name</th>
-                        <th>Description</th>
                     </tr>
                     <tr>
-                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
-                        <td>Highly capable Llama model</td>
                     </tr>
                     <tr>
-                        <td>gpt2</td>
-                        <td>Generative Pre-trained Transformer 2</td>
                     </tr>
                     <tr>
-                        <td>bert-base-uncased</td>
-                        <td>Bidirectional Encoder Representations from Transformers</td>
                     </tr>
                 </table>
                 """
@@ -132,43 +187,23 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
         with gr.Accordion("Parameters Overview", open=False):
             gr.Markdown(
                 """
-                ## System Message
-                ###### Sets the behavior and tone of the assistant.
-                ## Max New Tokens
-                ###### Determines the maximum length of the response.
-                ## Temperature
-                ###### Controls the randomness of the output. Lower values make the output more deterministic.
-                ## Top-P
-                ###### Used for nucleus sampling. Higher values include more tokens in consideration.
-                ## Frequency Penalty
-                ###### Penalizes the model for repeating the same tokens.
-                ## Seed
-                ###### Ensures reproducibility of results.
                 """
             )
-    # Chat interface
-    demo = gr.ChatInterface(
-        respond,
-        additional_inputs=[
-            system_message,
-            max_tokens,
-            temperature,
-            top_p,
-            frequency_penalty,
-            seed,
-            model,
-            custom_model
-        ],
-        chatbot=chatbot,
-        theme="Nymbo/Nymbo_Theme"
-    )
-if __name__ == "__main__":
-    print("Launching the demo application.")
-    demo.launch()

     top_p,
     frequency_penalty,
     seed,
+    model,
     custom_model
 ):
     """
+    This function handles the chatbot response. It takes in:
+    - message: the user's new message
+    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
+    - system_message: the system prompt
+    - max_tokens: the maximum number of tokens to generate in the response
+    - temperature: sampling temperature
+    - top_p: top-p (nucleus) sampling
+    - frequency_penalty: penalize repeated tokens in the output
+    - seed: a fixed seed for reproducibility; -1 will mean 'random'
+    - model: the selected model
+    - custom_model: a custom model provided by the user
     """
+    print(f"Received message: {message}")
+    print(f"History: {history}")
+    print(f"System message: {system_message}")
+    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
+    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    print(f"Model: {model}, Custom Model: {custom_model}")
+    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
+    # Use custom model if provided, otherwise use selected model
+    if custom_model.strip() != "":
+        model_to_use = custom_model.strip()
+    else:
+        model_to_use = model
+    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
+    # Add conversation history to the context
     for val in history:
+        user_part = val[0]
+        assistant_part = val[1]
+        if user_part:
+            messages.append({"role": "user", "content": user_part})
+            print(f"Added user message to context: {user_part}")
+        if assistant_part:
+            messages.append({"role": "assistant", "content": assistant_part})
+            print(f"Added assistant message to context: {assistant_part}")
+    # Append the latest user message
     messages.append({"role": "user", "content": message})
+    # Start with an empty string to build the response as tokens stream in
     response = ""
+    print("Sending request to OpenAI API.")
+    # Make the streaming request to the HF Inference API via openai-like client
     for message_chunk in client.chat.completions.create(
+        model=model_to_use,   # Use the selected or custom model
         max_tokens=max_tokens,
+        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
         frequency_penalty=frequency_penalty,
         seed=seed,
         messages=messages,
     ):
+        # Extract the token text from the response chunk
         token_text = message_chunk.choices[0].delta.content
+        print(f"Received token: {token_text}")
         response += token_text
         yield response
+    print("Completed response generation.")
 # Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
+print("Chatbot interface created.")
+# List of placeholder models for demonstration
+models_list = [
     "meta-llama/Llama-3.3-70B-Instruct",
+    "meta-llama/Llama-2-70B-chat",
+    "google/flan-t5-xl"
 ]
+# Function to filter models based on search input
+def filter_models(search_term):
+    filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
+    return gr.update(choices=filtered_models)
 # Create the Gradio ChatInterface
+# Adding additional fields for model selection and parameters
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=1,   maximum=4096, value=512, step=1,   label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0,  value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0,  value=0.95, step=0.05, label="Top-P"),
+        gr.Slider(
+            minimum=-2.0,
+            maximum=2.0,
+            value=0.0,
+            step=0.1,
+            label="Frequency Penalty"
+        ),
+        gr.Slider(
+            minimum=-1,
+            maximum=65535,  # Arbitrary upper limit for demonstration
+            value=-1,
+            step=1,
+            label="Seed (-1 for random)"
+        ),
+        gr.Textbox(label="Custom Model", placeholder="Enter custom model path here"),
+        gr.Accordion("Featured Models", open=True).update(
+            gr.Column([
+                gr.Textbox(label="Filter Models", placeholder="Search for a featured model...").change(
+                    filter_models, inputs="__self__", outputs="model"
+                ),
+                gr.Radio(label="Select a model below", value="meta-llama/Llama-3.3-70B-Instruct", choices=models_list, interactive=True, elem_id="model-radio")
+            ])
+        )
+    ],
+    fill_height=True,
+    chatbot=chatbot,
+    theme="Nymbo/Nymbo_Theme",
+)
+# Adding an "Information" tab with accordions for "Featured Models" and "Parameters Overview"
+with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
+    with gr.Tab("Chat"):
+        gr.Markdown("## Chat with the Model")
+        chatbot.render()
     with gr.Tab("Information"):
         with gr.Accordion("Featured Models", open=False):
+            gr.HTML(
                 """
+                <p><a href="https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending">See all available models</a></p>
                 <table style="width:100%; text-align:center; margin:auto;">
                     <tr>
                         <th>Model Name</th>
+                        <th>Type</th>
+                        <th>Notes</th>
                     </tr>
                     <tr>
+                        <td>Llama-3.3-70B-Instruct</td>
+                        <td>Instruction</td>
+                        <td>High performance</td>
                     </tr>
                     <tr>
+                        <td>Llama-2-70B-chat</td>
+                        <td>Chat</td>
+                        <td>Conversational</td>
                     </tr>
                     <tr>
+                        <td>Flan-T5-XL</td>
+                        <td>General</td>
+                        <td>Versatile</td>
                     </tr>
                 </table>
                 """
         with gr.Accordion("Parameters Overview", open=False):
             gr.Markdown(
                 """
+                ## Parameters Overview
+                ### Max new tokens
+                This slider controls the maximum number of tokens to generate in the response.
+                ### Temperature
+                Sampling temperature, which controls the randomness. A higher temperature makes the output more random.
+                ### Top-P
+                Top-p (nucleus) sampling, which controls the diversity. The model considers the smallest number of tokens whose cumulative probability exceeds the top-p threshold.
+                ### Frequency Penalty
+                Penalizes repeated tokens in the output, which helps to reduce repetition.
+                ### Seed
+                A fixed seed for reproducibility. Set to -1 for random seed.
                 """
             )
+print("Launching the demo application.")
+demo.launch()