# NOTE(review): the three lines that were here were stray pager / `git blame`
# output (file size, commit hashes, line numbers) accidentally pasted into the
# module; they made the file unparseable and were commented out.
import fastapi
from fastapi.responses import JSONResponse
from fastapi_users import schemas
from time import time
#from fastapi.middleware.cors import CORSMiddleware
#MODEL_PATH = "./qwen1_5-0_5b-chat-q4_0.gguf" #"./qwen1_5-0_5b-chat-q4_0.gguf"
import logging
import llama_cpp
import llama_cpp.llama_tokenizer
from pydantic import BaseModel
from fastapi import APIRouter
from app.users import current_active_user
#from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM
# Load the HF tokenizer and an ONNX copy of Qwen1.5-0.5B-Chat once at import
# time. export=True converts the PyTorch checkpoint to ONNX if no ONNX export
# exists in the hub cache yet.
# NOTE(review): this runs (and may download/export) at module import, which
# blocks app startup — consider lazy-loading on first request.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat")
model = ORTModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", export=True)
class GenModel(BaseModel):
    """Request body for the ``/llm/generate`` endpoint.

    Carries the user's question plus sampling parameters forwarded to the
    completion backend.
    """

    # The user's prompt text (required).
    question: str
    # Default system prompt. Fixed from the original, which contained garbled
    # text ("can.Also", "atat a conclusive", "Remember, response in English")
    # that was sent verbatim to the model.
    system: str = (
        "You are a helpful medical AI chat assistant. Help as much as you can. "
        "Also continuously ask for possible symptoms in order to attain a "
        "conclusive ailment or sickness and possible solutions. Remember to "
        "respond in English."
    )
    # Sampling parameters (see llama.cpp sampling docs for mirostat_*).
    temperature: float = 0.8
    seed: int = 101
    mirostat_mode: int = 2
    mirostat_tau: float = 4.0
    mirostat_eta: float = 1.1
class ChatModel(BaseModel):
    """Request body for the ``/llm/chat/`` endpoint.

    Carries the user's question plus sampling parameters forwarded to the
    chat backend.
    """

    # The user's message (required).
    question: str
    # Default system prompt. Fixed from the original, which contained garbled
    # text ("to a conclusive" missing a verb, "solutions.Remember" missing a
    # space, "response in English" ungrammatical).
    system: str = (
        "You are chatDoctor, a helpful health and medical assistant. You are "
        "chatting with a human. Help as much as you can. Also continuously ask "
        "for possible symptoms in order to reach a conclusive ailment or "
        "sickness and possible solutions. Remember to respond in English."
    )
    # Sampling parameters (see llama.cpp sampling docs for mirostat_*).
    temperature: float = 0.8
    seed: int = 101
    mirostat_mode: int = 2
    mirostat_tau: float = 4.0
    mirostat_eta: float = 1.1
# GGUF chat model served via llama.cpp, paired with the upstream HF tokenizer
# so tokenization matches the original Qwen chat template.
llm_chat = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q4_0.gguf",  # glob picks the 4-bit quantized file from the repo
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat"),
    verbose=False,
    n_ctx=512,  # small context window for short chat exchanges
    n_gpu_layers=0,  # CPU-only inference
    #chat_format="llama-2"
)
# Second llama.cpp instance dedicated to free-form completions, with a larger
# context window and llama.cpp's own tokenizer (HF tokenizer line disabled).
llm_generate = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q4_0.gguf",
    #tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat"),
    verbose=False,
    n_ctx=4096,  # larger window than llm_chat for longer generations
    n_gpu_layers=0,  # CPU-only inference
    # NOTE(review): mirostat_* are sampling-time parameters; verify that
    # Llama.__init__ actually honors them here rather than at create_completion.
    mirostat_mode=2,
    mirostat_tau=4.0,
    mirostat_eta=1.1,
    #chat_format="llama-2"
)
# Logger setup: module-level logger used by the endpoint handlers below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Disabled standalone app; this module now only exposes llm_router, which the
# main application is expected to include.
#app = fastapi.FastAPI(
#title="OpenGenAI",
#description="Your Excellect AI Physician")
"""
app.add_middleware(
CORSMiddleware,
allow_origins = ["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
)
"""
# All LLM endpoints are mounted under the /llm prefix.
llm_router = APIRouter(prefix="/llm")
@llm_router.get("/health", tags=["llm"])
def health():
    """Liveness probe: report that the LLM service is reachable."""
    payload = {"status": "ok"}
    return payload
# Chat Completion API
@llm_router.post("/chat/", tags=["llm"])
async def chat(chatm: ChatModel):  # , user: schemas.BaseUser = fastapi.Depends(current_active_user)):
    """Answer ``chatm.question`` with the ONNX question-answering pipeline,
    using ``chatm.system`` as the context passage.

    Returns the pipeline's prediction dict, or a 500 JSON response on failure.
    """
    try:
        # NOTE(review): the pipeline is rebuilt on every request; consider
        # hoisting it to module level once behavior is confirmed.
        onnx_qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
        # Bug fix: the original assignments had trailing commas, turning both
        # values into 1-tuples before they reached the pipeline.
        pred = onnx_qa(question=chatm.question, context=chatm.system)
        # Bug fix: the original printed the literal string "pred" and returned
        # "" — the prediction itself was discarded.
        logger.info("QA prediction: %s", pred)
        return pred
    except Exception as e:
        logger.error(f"Error in /chat endpoint: {e}")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )
# Text Generation API
@llm_router.post("/generate", tags=["llm"])
async def generate(gen: GenModel):  # , user: schemas.BaseUser = fastapi.Depends(current_active_user)):
    """Generate a free-form completion for ``gen.question`` with the ONNX
    text-generation pipeline.

    Returns the pipeline's list of generated sequences, or a 500 JSON
    response on failure.

    Note: the original body's "docstring" was 35 lines of dead llama.cpp
    streaming code; the live llama.cpp path remains available via
    ``llm_generate`` if streaming is reinstated.
    """
    try:
        # NOTE(review): the pipeline is rebuilt on every request; consider
        # hoisting it to module level once behavior is confirmed.
        onnx_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)
        # Bug fix: the local result was previously named `generate`, shadowing
        # this function's own name.
        result = onnx_gen(gen.question)
        return result
    except Exception as e:
        logger.error(f"Error in /generate endpoint: {e}")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )