import time
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class EndpointHandler:
    """
    Custom handler for `Qwen/Qwen2.5-Math-7B-Instruct`.
    """

    def __init__(self, path=""):
        """
        Initialize the model and tokenizer.

        :param path: Path to the model and tokenizer.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path, torch_dtype="auto", device_map="auto"
        )

    def __call__(self, data: dict):
        """
        Run generation on the input data.

        :param data: Input parameters for the model, in the form
            `{"inputs": "input_string", "parameters": {"parameter_1": 0, "parameter_2": 0}}`.
            An optional top-level `max_new_tokens` (default 1024) is also honored.
        :return: dict with keys `answer`, `num_new_tokens`, and `speed`.
        """
        question = data.get("inputs", None)
        max_new_tokens = data.get("max_new_tokens", 1024)
        parameters = data.get("parameters", {})
        if not question:
            raise ValueError("Input prompt is missing.")

        messages = [
            {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
            {"role": "user", "content": question + " Then, give your confidence level in percentage regarding your answer."},
        ]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # Move inputs to the model's device rather than hard-coding "cuda".
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

        # Re-seed per request so repeated calls sample different completions.
        torch.manual_seed(random.randint(0, 2 ** 32 - 1))

        # Merge sampling defaults with caller-supplied parameters. Letting
        # `parameters` override the defaults avoids a TypeError from duplicate
        # keyword arguments when the caller passes e.g. `temperature` itself.
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "temperature": 1.0,
            "do_sample": True,
            "top_p": 0.9,
            **parameters,
        }

        time_start = time.time()
        generated_ids = self.model.generate(**model_inputs, **gen_kwargs)
        time_end = time.time()

        # `generate` returns the prompt plus the completion; count only the new tokens.
        num_new_tokens = generated_ids.shape[1] - model_inputs.input_ids.shape[1]
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        speed = num_new_tokens / (time_end - time_start)

        return {
            "answer": response,
            "num_new_tokens": num_new_tokens,
            "speed": speed,
        }
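

# ---------------------------------------------------------------------------
# Minimal local smoke test: a sketch of how the handler is invoked, not part
# of the endpoint contract. The weights path below is an assumption; point it
# at wherever `Qwen/Qwen2.5-Math-7B-Instruct` has been downloaded.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    handler = EndpointHandler(path="./qwen2.5-math-7b-instruct")
    result = handler({
        "inputs": "What is 12 * 34?",
        "parameters": {"top_p": 0.8},  # overrides the default top_p of 0.9
    })
    print(result["answer"])
    print(f"{result['num_new_tokens']} new tokens at {result['speed']:.1f} tokens/s")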