import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList
import torch
import subprocess

# Create a non-root user inside the container: useradd -m -u 1000 user
subprocess.run(['useradd', '-m', '-u', '1000', 'user'])

import torch._dynamo
torch._dynamo.config.suppress_errors = True

import os
# import pwd
# print("HERE will print PWD")
# print(pwd.getpwuid(os.getuid())[0])
# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())

print("loading model")

# Load the tokenizer and model
repo_name = "nvidia/Hymba-1.5B-Instruct"
# repo_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
model = model.cuda().to(torch.bfloat16)
print("model is loaded")

# Stop generation at the model's end-of-sequence marker
stopping_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")])


def generate_response(prompt: str) -> str:
    """Chat with Hymba: build the message list, apply the chat template, generate, and decode the reply."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    # Apply chat template
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to('cuda')

    outputs = model.generate(
        tokenized_chat,
        max_new_tokens=256,
        do_sample=False,  # greedy decoding; temperature has no effect while sampling is off
        temperature=0.7,
        use_cache=True,
        stopping_criteria=stopping_criteria,
    )

    # Decode only the newly generated tokens, not the prompt
    input_length = tokenized_chat.shape[1]
    return tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)


# Sanity check: run one generation at startup
# prompt = input()
prompt = "Who are you?"
print("generating prompt")
response = generate_response(prompt)
print(f"User: {prompt}")
print(f"Model response: {response}")


def greet(name):
    """Gradio handler: treat the text box input as the user prompt and return the model's reply."""
    reply = generate_response(name)
    print(f"User: {name}")
    print(f"Model response: {reply}")
    return reply


demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()