import spaces
import threading
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the model and tokenizer locally
model_name = "kz919/QwQ-0.5B-Distilled-SFT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

# Define the function to handle chat responses; @spaces.GPU allocates a GPU for each call on ZeroGPU Spaces
@spaces.GPU
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
    # Prepare the prompt by combining history and system messages
    msg = [
        {"role": "system", "content": system_message}
    ]
    for user_input, assistant_response in history:
        msg.extend([
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": assistant_response},
        ])
    msg.append({"role": "user", "content": message})

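    # Render the message list with the model's chat template, ending with the assistant generation prompt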
    prompt = tokenizer.apply_chat_template(
        msg,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # The streamer yields decoded text chunks, skipping the prompt and special tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation in a background thread so the streamer can yield tokens as they arrive
    generation_thread = threading.Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,
            streamer=streamer,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
        ),
    )
    generation_thread.start()

    # Stream the tokens as they are generated
    text_buffer = ""
    for new_text in streamer:
        text_buffer += new_text
        yield text_buffer


# Create the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.", label="System message"),
        gr.Slider(minimum=1, maximum=16384, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ]
)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()