import gradio as gr
import os
from openai import OpenAI

################################################
#                INITIAL SETUP
################################################

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

# Our main response-generating function
def respond(
    user_message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,
    custom_model
):
    """
    This function handles the chatbot response. It takes in:
    - user_message: the user's new message
    - history: the list of previous messages, each as [user_text, assistant_text]
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 will mean 'random'
    - featured_model: the user-chosen model from the radio button
    - custom_model: a user-specified custom model that overrides featured_model if not empty
    """

    print(f"New user message: {user_message}")
    print(f"History so far: {history}")
    print(f"System message: {system_message}")
    print(f"max_tokens: {max_tokens}, temperature: {temperature}, top_p: {top_p}")
    print(f"frequency_penalty: {frequency_penalty}, seed: {seed}")
    print(f"Featured Model: {featured_model}")
    print(f"Custom Model: {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Determine which model to use:
    # a non-empty custom_model overrides the featured model; otherwise we use
    # the radio selection, and if neither is set we fall back to the default,
    # meta-llama/Llama-3.3-70B-Instruct.
    if custom_model.strip():
        model_to_use = custom_model.strip()
    elif featured_model is not None and featured_model.strip():
        model_to_use = featured_model.strip()
    else:
        model_to_use = "meta-llama/Llama-3.3-70B-Instruct"

    print(f"Model selected for inference: {model_to_use}")

    # Construct the conversation messages for the HF Inference API
    messages = [{"role": "system", "content": system_message}]
    for user_text, assistant_text in history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": user_message})

    # We'll collect and stream the response
    response_so_far = ""

    # Make the streaming request to the HF Inference API
    print("Sending request to OpenAI/Hugging Face Inference API...")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # The content for this partial chunk; the final chunk (and role-only
        # chunks) can arrive with content=None, so fall back to an empty string.
        token_text = message_chunk.choices[0].delta.content or ""
        response_so_far += token_text
        # Return partial response to Gradio to display in real-time
        yield response_so_far

    print("Completed response generation.")

################################################
#          GRADIO UI + STATE MANAGEMENT
################################################

def user_submit(user_message, history):
    """
    This function is called when the user sends a message.
    We simply add the user message to the conversation history.
    """
    print("user_submit triggered.")
    # Append the new user message to history
    if not history:
        history = []
    history = history + [[user_message, None]]
    return history, ""

def bot_reply(history, system_message, max_tokens, temperature, top_p,
              frequency_penalty, seed, featured_model, custom_model):
    """
    This function is triggered to produce the bot's response after the user has submitted.
    We call 'respond' for streaming text.
    """
    print("bot_reply triggered.")

    # The last history item is [user_message, None], just appended by user_submit
    user_message = history[-1][0]

    # We will stream the partial responses from 'respond'
    bot_response = respond(
        user_message=user_message,
        history=history[:-1],  # all items except the last, because we pass the last user msg separately
        system_message=system_message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        featured_model=featured_model,
        custom_model=custom_model
    )

    # As we yield from the generator, we update the last item in history with the partial response
    # Gradio streaming logic: yield the partial updates as they come in
    for partial_text in bot_response:
        history[-1][1] = partial_text
        yield history

# We define a small list of placeholder featured models for demonstration
models_list = [
    "meta-llama/Llama-2-13B-Chat-hf",
    "bigscience/bloom",
    "EleutherAI/gpt-neo-2.7B",
    "meta-llama/Llama-3.3-70B-Instruct"
]

def filter_models(search_term):
    """
    Filter function triggered when user types in the model_search box.
    Returns an updated list of models that contain the search term.
    """
    filtered = [m for m in models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
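
# For instance, filter_models("llama") narrows the radio choices to the two
# meta-llama entries above, while an empty search restores the full list
# (every name contains the empty string).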


################################################
#        BUILDING THE GRADIO LAYOUT
################################################

with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown(
        """
        # Serverless-TextGen-Hub
        **A UI for text generation using Hugging Face's Inference API.**
        
        Below is a simple chat interface. You can pick from **Featured Models** or specify a **Custom Model** 
        to override the choice. If you're not sure, just use the default.
        """
    )

    # State holding the conversation history as a list of [user, bot] pairs
    conversation_state = gr.State([])

    # Row for system message + advanced settings
    with gr.Accordion("Advanced Settings", open=False):
        system_message = gr.Textbox(
            label="System Message",
            value="You are a helpful assistant.",
            lines=2,
            info="Provides background or personality instructions to the model."
        )
        max_tokens = gr.Slider(
            minimum=1,
            maximum=4096,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P"
        )
        frequency_penalty = gr.Slider(
            minimum=-2.0,
            maximum=2.0,
            value=0.0,
            step=0.1,
            label="Frequency Penalty"
        )
        seed = gr.Slider(
            minimum=-1,
            maximum=65535,
            value=-1,
            step=1,
            label="Seed (-1 for random)"
        )

    # Featured Models + filtering
    with gr.Accordion("Featured Models", open=False):
        model_search = gr.Textbox(
            label="Filter Models",
            placeholder="Search for a featured model...",
            lines=1
        )
        featured_model_radio = gr.Radio(
            label="Select a featured model below",
            choices=models_list,
            value=models_list[0],  # default selection
            interactive=True
        )
        model_search.change(
            filter_models,
            inputs=model_search,
            outputs=featured_model_radio
        )

    # This is the Custom Model box (overrides Featured Models if not empty)
    custom_model = gr.Textbox(
        label="Custom Model",
        value="",
        info="(Optional) Provide a custom HF model path. If not empty, it overrides the Featured Model."
    )

    # The main Chatbot interface
    chatbot = gr.Chatbot(height=600)

    # Textbox for the user to type a new message
    with gr.Row():
        user_input = gr.Textbox(
            show_label=False,
            placeholder="Type your message here (press enter or click 'Submit')",
            lines=2
        )
        submit_btn = gr.Button("Submit", variant="primary")

    # The user submits -> record the message in conversation state, then have
    # the bot reply, streaming its output. Chaining with .then() guarantees
    # bot_reply only runs after user_submit has appended the new message;
    # two independent .click() listeners would not enforce that ordering.
    submit_btn.click(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        # bot_reply is a generator; queue=True lets Gradio stream each
        # yielded update into the Chatbot in real time.
        queue=True
    )

    # Pressing Enter in user_input triggers the same submit-then-reply chain
    user_input.submit(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        queue=True
    )

    gr.HTML("""
    <br>
    <p style='text-align:center;'>
        Developed by <strong>Nymbo</strong>. 
        Powered by <strong>Hugging Face Inference API</strong>.
    </p>
    """)

# Finally, launch the app
if __name__ == "__main__":
    print("Launching the Serverless-TextGen-Hub application...")
    demo.launch()