import gradio as gr
import os
from openai import OpenAI

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded." if ACCESS_TOKEN else "Warning: HF_TOKEN is not set.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model,
    featured_model
):
    """
    This function handles the chatbot response. It takes in:
    - message: the user's new message
    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - custom_model: a user-provided custom model name (if any)
    - featured_model: the user-selected model from the radio
    """

    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Custom model: {custom_model}")
    print(f"Featured model: {featured_model}")

    # Convert seed to None if -1 (meaning "random")
    if seed == -1:
        seed = None

    # Construct the conversation array required by the HF Inference API
    messages = [{"role": "system", "content": system_message or ""}]

    # Add conversation history
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # The latest user message
    messages.append({"role": "user", "content": message})

    # If custom_model is not empty, it overrides the featured model
    model_to_use = custom_model.strip() or featured_model.strip()
    # If somehow both are empty, default to an example model
    if model_to_use == "":
        model_to_use = "meta-llama/Llama-3.3-70B-Instruct"

    print(f"Model selected for inference: {model_to_use}")

    # Build the response from the streaming tokens
    response = ""
    print("Sending request to OpenAI API.")

    # Streaming request to the HF Inference API
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Extract the token text from the response chunk; the final chunk can
        # carry a None delta, so fall back to an empty string
        token_text = message_chunk.choices[0].delta.content or ""
        print(f"Received token: {token_text}")
        response += token_text
        # Yield the partial response so Gradio can display it in real time
        yield response

    print("Completed response generation.")

#
# Building the Gradio interface below
#
print("Building the Gradio interface with advanced features...")

# --- Create a list of 'Featured Models' for demonstration. You can customize as you wish. ---
models_list = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "BigScience/bloom",
    "openai/gpt-4",
    "google/flan-t5-xxl",
    "EleutherAI/gpt-j-6B",
    "YourSpecialModel/awesome-13B",
]

# This function filters the above models_list by a given search term:
def filter_models(search_term):
    filtered = [m for m in models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
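
# For example, filter_models("llama") narrows the radio choices to just
# "meta-llama/Llama-3.3-70B-Instruct" from the list above.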

# We’ll create a Chatbot in a Blocks layout to incorporate an Accordion for "Featured Models"
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown("## Serverless-TextGen-Hub\nA comprehensive UI for text generation, including featured models and custom model overrides.")

    # The Chatbot itself
    chatbot = gr.Chatbot(label="TextGen Chatbot", height=600)

    with gr.Row():
        with gr.Column(scale=1):
            # We create interactive UI elements that will feed into the 'respond' function

            # System message
            system_message = gr.Textbox(label="System message", placeholder="Set the system role instructions here.")

            # Accordion for selecting the model
            with gr.Accordion("Featured Models", open=True):
                model_search = gr.Textbox(
                    label="Filter Models",
                    placeholder="Search for a featured model...",
                    lines=1
                )
                featured_model = gr.Radio(
                    label="Select a Featured Model Below",
                    choices=models_list,
                    value="meta-llama/Llama-3.3-70B-Instruct",  # default
                    interactive=True,
                )
                # Link the search box to filter the radio model choices
                model_search.change(filter_models, inputs=model_search, outputs=featured_model)

            # A text box to optionally override the featured model
            custom_model = gr.Textbox(
                label="Custom Model",
                info="(Optional) Provide a custom HF model path. If non-empty, it overrides your featured model choice."
            )

            # Sliders
            max_tokens = gr.Slider(
                minimum=1,
                maximum=4096,
                value=512,
                step=1,
                label="Max new tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-P"
            )
            frequency_penalty = gr.Slider(
                minimum=-2.0,
                maximum=2.0,
                value=0.0,
                step=0.1,
                label="Frequency Penalty"
            )
            seed = gr.Slider(
                minimum=-1,
                maximum=65535,
                value=-1,
                step=1,
                label="Seed (-1 for random)"
            )

        # The "chat" Column
        with gr.Column(scale=2):
            # We store the conversation history in a state variable
            state = gr.State([])  # Each element in state is (user_message, assistant_message)

            # Chat input box for the user
            with gr.Row():
                txt = gr.Textbox(
                    label="Enter your message",
                    placeholder="Type your request here, then press 'Submit'",
                    lines=3
                )

            # Button to submit the message
            submit_btn = gr.Button("Submit", variant="primary")

    #
    # The 'respond' function is tied to the chatbot display.
    # We'll define a small wrapper that updates the 'history' (state) each time.
    #

    def user_submit(user_message, chat_history):
        """
        This function just adds the user message to the history and returns it.
        The actual text generation will come from 'bot_respond' next.
        """
        # Append new user message to the existing conversation
        chat_history = chat_history + [(user_message, None)]
        return "", chat_history

    def bot_respond(chat_history, sys_msg, max_t, temp, top, freq_pen, s, custom_mod, feat_model):
        """
        This function calls our 'respond' generator to get the text.
        It updates the last message in chat_history with the bot's response as it streams.
        """
        user_message = chat_history[-1][0] if len(chat_history) > 0 else ""
        # We call the generator
        bot_messages = respond(
            user_message,
            chat_history[:-1],     # all but the last user message
            sys_msg,
            max_t,
            temp,
            top,
            freq_pen,
            s,
            custom_mod,
            feat_model,
        )

        # Stream the partial responses back; `respond` yields cumulative text
        final_bot_msg = ""
        for partial_response in bot_messages:
            final_bot_msg = partial_response
            # Update the chatbot display in real time
            chat_history[-1] = (user_message, final_bot_msg)
            yield chat_history

    # Tie the Submit button to the user_submit function, and then to bot_respond
    submit_btn.click(
        user_submit,
        inputs=[txt, state],
        outputs=[txt, state],
        queue=False
    ).then(
        bot_respond,
        inputs=[state, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, custom_model, featured_model],
        outputs=[chatbot],
        queue=True
    )

print("Interface construction complete. Ready to launch!")

# Launch the Gradio Blocks interface
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()
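    # Optional: demo.launch() also accepts arguments such as share=True (for a
    # public link) or server_port=7860 if you need them.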