how to run

#1
by sdyy - opened

How do I run this?
Is there working code with device_map="auto"?
I'm on a Colab T4.

Use a pipeline as a high-level helper

from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")
pipe(messages)
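As a rough sketch of the same call with explicit device placement (the parameter values here are assumptions, and a 70B model's 4-bit weights, roughly 35 GB, still exceed a T4's ~15 GB of VRAM, so expect offloading or an out-of-memory error):

from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline(
    "text-generation",
    model="unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit",
    device_map="auto",  # let accelerate decide where to place the weights
)
print(pipe(messages, max_new_tokens=64))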

Load model directly

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")
model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")

I get an error.
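The error itself isn't shown, but memory is the most likely cause: even at 4 bits per weight, a 70B-parameter model needs roughly 35 GB for the weights alone, while a Colab T4 has about 15 GB of VRAM. A quick back-of-envelope check (figures are approximate):

params = 70e9              # ~70B parameters
bytes_per_param = 0.5      # 4-bit weights = half a byte per parameter
weights_gb = params * bytes_per_param / 1e9
print(f"~{weights_gb:.0f} GB of weights")   # ~35 GB vs. ~15 GB of T4 VRAM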

I have a good code snippet for the Llama model:

from unsloth import FastLanguageModel  # model and tokenizer are assumed to be loaded already (see the sketch after this function)

def chat_llama3(message: str, context: str):
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": message},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    
    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer)
    #_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 1024, use_cache = True)
    output = model.generate(input_ids = inputs, max_new_tokens = 1024, use_cache = True)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract the assistant's message
    user_marker = "user"
    assistant_marker = "assistant"
    
    response_start = generated_text.find(assistant_marker) + len(assistant_marker)
    response_end = generated_text.find(user_marker, response_start)
    
    if response_end == -1:
        response = generated_text[response_start:].strip()
    else:
        response = generated_text[response_start:response_end].strip()
    
    return response
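For context, this snippet assumes model and tokenizer were created with unsloth; a minimal sketch of that setup (the sequence length and dtype values are assumptions, and a 70B checkpoint still needs far more memory than a single T4 provides):

from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit",
    max_seq_length = 2048,   # assumed; pick to fit your prompts
    dtype = None,            # auto-detect
    load_in_4bit = True,     # the checkpoint is already bnb 4-bit
)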

Have you tried this code with Nemotron?

Nemotron 4-bit doesn't run on Colab with device_map="auto" or with AirLLM.

It's a LLaMA 3.1 base model, so I think it should work.

On a Colab T4:

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit"

# Set up the quantization configuration
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_fp16=True)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with quantization support
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,  # use this if you want to distribute the model automatically across CPU and GPU
)
def chat_llama3(message: str, context: str):
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": message},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer)

    output = model.generate(input_ids=inputs, max_new_tokens=1024, use_cache=True)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the assistant's message
    user_marker = "user"
    assistant_marker = "assistant"

    response_start = generated_text.find(assistant_marker) + len(assistant_marker)
    response_end = generated_text.find(user_marker, response_start)

    if response_end == -1:
        response = generated_text[response_start:].strip()
    else:
        response = generated_text[response_start:response_end].strip()

    return response

context = "You are a smart assistant, here to help."
message = "What is the capital of France?"
response = chat_llama3(message, context)
print(response)

Unused kwargs: ['bnb_4bit_use_fp16']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
low_cpu_mem_usage was None, now set to True since model is quantized.
Loading checkpoint shards:  38% 3/8 [01:10<01:49, 21.99s/it]

OutOfMemoryError Traceback (most recent call last)
in <cell line: 12>()
10
11 # Load the model with quantization support
---> 12 model = AutoModelForCausalLM.from_pretrained(
13 model_name,
14 quantization_config=quant_config, # use this if you want to distribute the model automatically across CPU and GPU

5 frames
/usr/local/lib/python3.10/dist-packages/bitsandbytes/nn/modules.py in from_prequantized(cls, data, quantized_stats, requires_grad, device, module, **kwargs)
277 **kwargs,
278 ) -> "Params4bit":
--> 279 self = torch.Tensor._make_subclass(cls, data.to(device))
280 self.requires_grad = requires_grad
281 self.quant_state = QuantState.from_dict(qs_dict=quantized_stats, device=device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 75.06 MiB is free. Process 103552 has 14.67 GiB memory in use. Of the allocated memory 14.52 GiB is allocated by PyTorch, and 59.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
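Two separate issues show up in that log. First, bnb_4bit_use_fp16 is not a BitsAndBytesConfig argument (hence the "Unused kwargs" warning); the compute dtype is set with bnb_4bit_compute_dtype. Second, the OutOfMemoryError is a capacity problem: ~35 GB of 4-bit weights cannot fit in ~15 GB of T4 VRAM, so fixing the config alone will not help. A corrected configuration would look roughly like this sketch (still expected to OOM on a single T4 without offloading or a larger GPU):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # valid replacement for the invalid bnb_4bit_use_fp16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # lets accelerate place layers, but the weights still exceed T4 VRAM
)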

Load model directly

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit")
model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit", device_map="auto")
How can I fix this?

Do you know Python code for inference (without training) with the unsloth/Llama-3.1-Nemotron-70B-Instruct-bnb-4bit model?

How to use it:

def chat_llama3(message: str, context: str):
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": message},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer)

    output = model.generate(input_ids=inputs, max_new_tokens=1024, use_cache=True)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the assistant's message
    user_marker = "user"
    assistant_marker = "assistant"

    response_start = generated_text.find(assistant_marker) + len(assistant_marker)
    response_end = generated_text.find(user_marker, response_start)

    if response_end == -1:
        response = generated_text[response_start:].strip()
    else:
        response = generated_text[response_start:response_end].strip()

    return response
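A minimal way to call it, once model and tokenizer are loaded (the prompt strings below are just placeholders):

context = "You are a helpful assistant."
message = "What is the capital of France?"
print(chat_llama3(message, context))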
