import time
import random

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


class EndpointHandler:
    """
    Custom handler for `Qwen/Qwen2.5-Math-7B-Instruct`.
    """

    def __init__(self, path=""):
        """
        Initialize model and tokenizer.

        :param path: Path to model and tokenizer
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path, torch_dtype="auto", device_map="auto"
        )

    def __call__(self, data: dict):
        """
        Execute the model on the input data.

        :param data: Input payload for the model. Should be of the form:
            `{"inputs": "input_string", "max_new_tokens": 1024, "parameters": {"parameter_1": 0, "parameter_2": 0}}`
            where `max_new_tokens` and `parameters` are optional; `parameters` is
            forwarded as extra keyword arguments to `model.generate`.

        :return: dict with keys `answer`, `num_new_tokens`, and `speed` (tokens per second)
        """

        question = data.get("inputs", None)
        max_new_tokens = data.get("max_new_tokens", 1024)
        parameters = data.get("parameters", {})

        if not question:
            raise ValueError("Input prompt is missing.")

        messages = [
            {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
            {"role": "user", "content": question + " Then, give your confidence level in percentage regarding your answer."},
        ]

        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

        # Re-seed so repeated calls with the same prompt can yield different samples.
        torch.manual_seed(random.randint(0, 2 ** 32 - 1))

        # Default sampling settings; caller-supplied `parameters` take precedence
        # and are merged here so `generate` never receives duplicate keyword arguments.
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "temperature": 1.0,
            "top_p": 0.9,
        }
        gen_kwargs.update(parameters)

        time_start = time.time()
        generated_ids = self.model.generate(**model_inputs, **gen_kwargs)
        time_end = time.time()

        num_new_tokens = len(generated_ids[0]) - model_inputs.input_ids.shape[1]

        # Strip the prompt tokens so only the newly generated text is decoded.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        speed = num_new_tokens / (time_end - time_start)

        return {
            "answer": response,
            "num_new_tokens": num_new_tokens,
            "speed": speed,
        }
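

# A minimal local usage sketch (not part of the original handler): it assumes the
# model weights are reachable at the given path or repo id and that a GPU is
# available for device_map="auto". The question and sampling parameters below are
# placeholders chosen for illustration only.
if __name__ == "__main__":
    handler = EndpointHandler(path="Qwen/Qwen2.5-Math-7B-Instruct")
    result = handler(
        {
            "inputs": "What is the sum of the first 100 positive integers?",
            "max_new_tokens": 512,
            "parameters": {"temperature": 0.7},
        }
    )
    print(result["answer"])
    print(f"{result['num_new_tokens']} new tokens at {result['speed']:.1f} tokens/s")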