Using Llama offline with transformers is slow and response is repeating itself
#139
by
AyoxRay
- opened
Hi
I tried using Llama 3 offline (https://huggingface.co./docs/transformers/installation#fetch-models-and-tokenizers-to-use-offline)
with the following transformers:
- Transformers Pipeline (https://huggingface.co./meta-llama/Meta-Llama-3-8B-Instruct#transformers-pipeline)
- Transformers AutoModelFor CausalLm (https://huggingface.co./meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm)
in a streamlit chat.
But when using "pipeline()" or "model.generate()", it runs for almost an hour and the result to the prompt "hi how are you?" is :
<|im_start|>assistant
hi how are you?<|im_end|>
<|im_start|>user
hi how are you?<|im_end|>
<|im_start|>system
hi how are you?<|im_end|> (repeating)
Code:
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
model_id = "Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
st.title("ChatGPT-like clone")
pipeline = transformers.pipeline(
"text-generation",
model=model_id,
model_kwargs={"torch_dtype": torch.bfloat16},
device_map="auto",
)
if "messages" not in st.session_state:
st.session_state.messages = []
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
def llama0(prompt, context):
messages = [
{"role": "system", "content": context},
{"role": "user", "content": prompt},
]
terminators = [
pipeline.tokenizer.eos_token_id,
pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
outputs = pipeline(
messages,
max_new_tokens=256,
eos_token_id=terminators,
do_sample=True,
temperature=0.6,
top_p=0.9,
)
return outputs[0]["generated_text"][-1]
def llama1(prompt, context):
messages = [
{"role": "system", "content": context},
{"role": "user", "content": prompt}
]
input_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(model.device)
terminators = [
tokenizer.eos_token_id,
tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
output = model.generate(
input_ids,
max_new_tokens=256,
eos_token_id=terminators,
do_sample=True,
temperature=0.6,
top_p=0.9,
)
response = output[0][input_ids.shape[-1]:]
# response1 = tokenizer.decode(response, skip_special_tokens=True)
return response
if prompt := st.chat_input("What is up?"):
st.session_state.messages.append({"role": "user", "content": prompt})
with st.chat_message("user"):
st.markdown(prompt)
with st.chat_message("assistant"):
st.markdown(llama1(prompt, prompt))
Can someone help me?