How to run this model on CPU on Windows 10
#3 · opened by brand17
I have no GPU, but I can run this model through Ollama at a speed of about 1 t/s.
However, it doesn't work well when I try the following code:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
)
import torch

new_model = "openbuddy/openbuddy-llama3-8b-v21.1-8k"

# Load the model in bfloat16; with no GPU available, device_map="auto" places it on CPU.
model = AutoModelForCausalLM.from_pretrained(
    new_model,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    new_model,
    max_length=2048,
    trust_remote_code=True,
    use_fast=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ChatML-style prompt; the user question is in Russian
# ("How do I open a brokerage account?").
prompt = """<|im_start|>system
You are a helpful AI assistant.<|im_end|>
<|im_start|>user
Как открыть брокерский счет?<|im_end|>
<|im_start|>assistant
"""

inputs = tokenizer.encode(
    prompt, return_tensors="pt", add_special_tokens=False
).cpu()

generation_config = GenerationConfig(
    max_new_tokens=700,
    temperature=0.5,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

outputs = model.generate(
    generation_config=generation_config,
    input_ids=inputs,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))
It looks like model.generate runs much slower than the same model does under Ollama.
I can see that the process only uses 25% of the CPU.
What am I doing wrong?
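
For reference, a minimal sketch of how the PyTorch CPU thread settings could be inspected and raised before calling model.generate, assuming the 25% CPU usage is related to the default thread count (whether this is the actual bottleneck here is only an assumption):

import os
import torch

# Inspect how many threads PyTorch currently uses for CPU inference
# (assumption: generation is bound by intra-op parallelism).
print("logical CPU cores:", os.cpu_count())
print("torch intra-op threads:", torch.get_num_threads())
print("torch inter-op threads:", torch.get_num_interop_threads())

# Raise the intra-op thread count to the number of logical cores before
# running generate(); whether this helps in this case is an assumption.
torch.set_num_threads(os.cpu_count())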