How to run
How do I run this model? Is there working code for device_map="auto" on a Colab T4?
Use a pipeline as a high-level helper
from transformers import pipeline
messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")
pipe(messages)
Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")
model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")
This gives an error.
I have a good code snippet for the Llama model:
def chat_llama3(message: str, context: str):
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": message},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,  # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer)
    # _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 1024, use_cache = True)
    output = model.generate(input_ids = inputs, max_new_tokens = 1024, use_cache = True)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the assistant's message
    user_marker = "user"
    assistant_marker = "assistant"
    response_start = generated_text.find(assistant_marker) + len(assistant_marker)
    response_end = generated_text.find(user_marker, response_start)
    if response_end == -1:
        response = generated_text[response_start:].strip()
    else:
        response = generated_text[response_start:response_end].strip()
    return response
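For reference, a minimal usage sketch of this helper, assuming model and tokenizer are already loaded (for example with Unsloth's FastLanguageModel.from_pretrained), so the free names inside chat_llama3 resolve:

# Hypothetical usage; `model` and `tokenizer` must already exist in scope,
# e.g. loaded beforehand with unsloth's FastLanguageModel.from_pretrained(...).
reply = chat_llama3(
    message="What is the capital of France?",
    context="You are a helpful assistant.",
)
print(reply)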
Have you tried this code with Nemotron?
Nemotron 4-bit does not run on Colab with device_map="auto" or AirLLM.
It's a LLaMA 3.1 base model, so I think it should work.
On a Colab T4:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit"

# Set up the quantization configuration
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_fp16=True)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with quantization support
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,  # use this if you want to distribute the model automatically across CPU and GPU
)
def chat_llama3(message: str, context: str):
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": message},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer)
    output = model.generate(input_ids=inputs, max_new_tokens=1024, use_cache=True)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the assistant's message
    user_marker = "user"
    assistant_marker = "assistant"
    response_start = generated_text.find(assistant_marker) + len(assistant_marker)
    response_end = generated_text.find(user_marker, response_start)
    if response_end == -1:
        response = generated_text[response_start:].strip()
    else:
        response = generated_text[response_start:response_end].strip()
    return response
context = "أنت مساعد ذكي هنا للمساعدة."  # "You are a smart assistant, here to help."
message = "ما هي عاصمة فرنسا؟"  # "What is the capital of France?"
response = chat_llama3(message, context)
print(response)
Unused kwargs: ['bnb_4bit_use_fp16']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
low_cpu_mem_usage was None, now set to True since model is quantized.
Loading checkpoint shards: 38% | 3/8 [01:10<01:49, 21.99s/it]
OutOfMemoryError Traceback (most recent call last)
in <cell line: 12>()
10
11 # Load the model with quantization support
---> 12 model = AutoModelForCausalLM.from_pretrained(
13 model_name,
14 quantization_config=quant_config,  # use this if you want to distribute the model automatically across CPU and GPU
5 frames
/usr/local/lib/python3.10/dist-packages/bitsandbytes/nn/modules.py in from_prequantized(cls, data, quantized_stats, requires_grad, device, module, **kwargs)
277 **kwargs,
278 ) -> "Params4bit":
--> 279 self = torch.Tensor._make_subclass(cls, data.to(device))
280 self.requires_grad = requires_grad
281 self.quant_state = QuantState.from_dict(qs_dict=quantized_stats, device=device)
OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 75.06 MiB is free. Process 103552 has 14.67 GiB memory in use. Of the allocated memory 14.52 GiB is allocated by PyTorch, and 59.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
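For context, a rough back-of-the-envelope estimate (assuming about 0.5 bytes per parameter for 4-bit weights, and ignoring the KV cache and runtime overhead) shows why this OOM is expected on a T4:

# Rough estimate only, not an exact measurement.
params = 70e9                    # ~70B parameters
weights_gb = params * 0.5 / 1e9  # 4-bit quantization ≈ 0.5 bytes per parameter -> ~35 GB
t4_vram_gb = 15                  # a Colab T4 has about 15 GB of VRAM
print(weights_gb, t4_vram_gb)    # 35.0 vs 15: the weights alone do not fit on the GPU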
Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")
model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit", device_map="auto")
How can I fix this?
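One direction that might help, sketched here but not verified on a free T4 (and likely very slow or still failing, since the checkpoint is pre-quantized with bitsandbytes): let Accelerate offload whatever does not fit on the GPU to CPU RAM and disk. The memory caps and offload folder below are assumptions, not tested values:

# Sketch only: offload layers that do not fit on the GPU to CPU RAM and disk.
# Assumes enough system RAM/disk; free Colab gives roughly 12 GB of system RAM.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,  # allow offloaded modules to stay on the CPU
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",                        # let Accelerate place layers on GPU/CPU/disk
    max_memory={0: "13GiB", "cpu": "12GiB"},  # keep GPU usage under the T4's ~15 GiB
    offload_folder="offload",                 # spill whatever remains to disk
)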
Do you know Python code for inference (without training) with the unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit model?
How do I use this?

def chat_llama3(message: str, context: str):
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": message},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer)
    output = model.generate(input_ids=inputs, max_new_tokens=1024, use_cache=True)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the assistant's message
    user_marker = "user"
    assistant_marker = "assistant"
    response_start = generated_text.find(assistant_marker) + len(assistant_marker)
    response_end = generated_text.find(user_marker, response_start)
    if response_end == -1:
        response = generated_text[response_start:].strip()
    else:
        response = generated_text[response_start:response_end].strip()
    return response
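To answer the question above about inference-only code: a minimal sketch using Unsloth directly (no training), assuming a GPU with enough VRAM for the ~70B 4-bit weights (roughly 40 GB or more, so a Colab T4 will not be enough):

# Inference-only sketch with Unsloth; assumes a GPU with ~40 GB+ of VRAM.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable native 2x faster inference

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

output = model.generate(input_ids=inputs, max_new_tokens=256, use_cache=True)
# Decode only the newly generated tokens instead of searching for role markers
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))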