Bielik-11B-v2.3
Collection
A collection of models based on Bielik-11B-v2.3 (merge of Bielik models) - instruct and quantized versions.
•
8 items
•
Updated
•
11
This repo contains OpenVINO 4-bit format model files for SpeakLeash's Bielik-11B-v2.3-Instruct.
DISCLAIMER: Be aware that quantised models show reduced response quality and possible hallucinations!
This model can be deployed efficiently using OpenVINO. Below you can find two ways to run inference: with the Intel Optimum (optimum-intel) library, or with the OpenVINO library alone.
The simplest LLM inference code, using OpenVINO with the optimum-intel library:
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer
# Minimal text generation with optimum-intel: load the OpenVINO model and its
# tokenizer from the same repository id, build a chat-formatted prompt, generate
# up to 500 new tokens and print the decoded sequence.
model_id = "speakleash/Bielik-11B-v2.3-Instruct-4bit-ov"
model = OVModelForCausalLM.from_pretrained(model_id, use_cache=False)

question = "Dlaczego ryby nie potrafią fruwać?"

# The prompt is written out by hand: a system turn, the user turn carrying the
# question, and an opened assistant turn for the model to complete.
# NOTE(review): the text already starts with "<s>"; if the tokenizer also
# prepends a BOS token by default the sequence would begin with two of them —
# TODO confirm against the tokenizer configuration.
prompt_text_bielik = f"""<s><|im_start|> system
Odpowiadaj krótko, precyzyjnie i wyłącznie w języku polskim.<|im_end|>
<|im_start|> user
{question}<|im_end|>
<|im_start|> assistant
"""

tokenizer = AutoTokenizer.from_pretrained(model_id)
encoded_prompt = tokenizer(prompt_text_bielik, return_tensors="pt")
generated = model.generate(**encoded_prompt, max_new_tokens=500)

# Only the first (and only) sequence of the batch is decoded; special tokens
# are dropped from the printed text.
decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(decoded_text)
Run the LLM with only OpenVINO (additionally, the code below uses greedy decoding instead of sampling).
import openvino as ov
import numpy as np
from transformers import AutoTokenizer
# Token-by-token greedy generation using only the OpenVINO runtime.
#
# The prompt is run through the model once; afterwards only the newly chosen
# token is fed back on each step, together with its position id and an
# attention mask that covers every position processed so far.

# read_model() is given a file path, so this presumably expects the repository
# files to be available locally under this directory — TODO confirm.
model_path = "speakleash/Bielik-11B-v2.3-Instruct-4bit-ov/openvino_model.xml"
tokenizer = AutoTokenizer.from_pretrained("speakleash/Bielik-11B-v2.3-Instruct-4bit-ov")

ov_model = ov.Core().read_model(model_path)
compiled_model = ov.compile_model(ov_model, "CPU")
infer_request = compiled_model.create_infer_request()

question = "Dlaczego ryby nie potrafią fruwać?"
# NOTE(review): the text already starts with "<s>"; if tokenizer.encode() also
# prepends a BOS token by default the sequence would begin with two of them —
# TODO confirm against the tokenizer configuration.
prompt_text_bielik = f"""<s><|im_start|> system
Odpowiadaj krótko, precyzyjnie i wyłącznie w języku polskim.<|im_end|>
<|im_start|> user
{question}<|im_end|>
<|im_start|> assistant
"""

# Inputs for the first step: the whole prompt, an all-ones mask of the same
# shape, and position ids 0..len-1 shaped (1, len).
tokens = tokenizer.encode(prompt_text_bielik, return_tensors="np")
input_ids = tokens
attention_mask = np.ones_like(input_ids)
position_ids = np.arange(len(tokens[0])).reshape(1, -1)
beam_idx = np.array([0], dtype=np.int32)  # single sequence, no beam search

# Clear any state left in the request before starting a new sequence.
infer_request.reset_state()

prev_output = ''
# A plain list of Python ints: appending is cheap and avoids the dtype
# promotion that np.append() on an int32 array would cause.
generated_text_ids = []
num_max_token_for_generation = 500

print(f'Pytanie: {question}')
print("Odpowiedź:", end=' ', flush=True)

for _ in range(num_max_token_for_generation):
    response = infer_request.infer(inputs={
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'position_ids': position_ids,
        'beam_idx': beam_idx
    })

    # Logits of the last position of the first (only) sequence.
    next_token_logits = response['logits'][0, -1, :]
    sampled_id = int(np.argmax(next_token_logits))  # Greedy decoding

    # Stop before the end-of-sequence token is stored or printed, so the
    # marker neither shows up in the answer nor inflates the token count.
    if sampled_id == tokenizer.eos_token_id:
        print('\n\n*** Zakończono generowanie.')
        break

    generated_text_ids.append(sampled_id)

    # Decode everything generated so far and print only the new suffix;
    # decoding tokens one at a time could split multi-token characters.
    output_text = tokenizer.decode(generated_text_ids)
    print(output_text[len(prev_output):], end='', flush=True)
    prev_output = output_text

    # Next step: feed just the new token at the next position, and extend the
    # mask by one so it keeps covering the prompt plus all generated tokens.
    input_ids = np.array([[sampled_id]], dtype=np.int64)
    attention_mask = np.concatenate(
        [attention_mask, np.ones((1, 1), dtype=attention_mask.dtype)],
        axis=1,
    )
    position_ids = np.array([[position_ids[0, -1] + 1]], dtype=np.int64)

print(f'\n\n*** Wygenerowano {len(generated_text_ids)} tokenów.')
If you have any questions or suggestions, please use the discussion tab. If you want to contact us directly, join our Discord SpeakLeash.
Base model
speakleash/Bielik-11B-v2.3-Instruct