Spaces:

usag1e
/

my-llm-endpoint-fresh

Runtime error

File size: 1,255 Bytes

8e1f74f
a931f78
8e1f74f
 
32777f1
db3d08a
 
8e1f74f
2cb7578
db3d08a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cb7578
 
db3d08a
8e1f74f
a931f78
c9f6dd3
 
8e1f74f

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model configuration
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Base"  # Hugging Face model
# Device that request tensors are moved to before generation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
# NOTE(review): trust_remote_code=True executes Python code shipped in the
# model repository, and revision="main" follows a moving branch -- consider
# pinning revision to a specific commit hash.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # device_map="auto" hands weight placement (including any CPU/disk
    # offload) to Accelerate. The result must not be moved again with
    # .to(device): a dispatched model warns on .to() and raises when some
    # modules are offloaded, so the model is used exactly as returned.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        revision="main",
    )
except Exception as e:
    # Surface the failure in the startup log, then abort startup by re-raising.
    print(f"Error loading model: {e}")
    raise

# FastAPI app initialization
app = FastAPI()

# Input schema
# Request body accepted by the POST /predict endpoint.
class Query(BaseModel):
    # Prompt text to generate from; /predict rejects an empty value with HTTP 400.
    input_text: str

@app.post("/predict")
async def predict(query: Query):
    """Generate text from the submitted prompt.

    Args:
        query: Request body; ``query.input_text`` is the prompt.

    Returns:
        A dict ``{"response": text}`` where ``text`` is the first generated
        sequence decoded with special tokens skipped.

    Raises:
        HTTPException: Status 400 when the prompt is empty or contains only
            whitespace.
    """
    input_text = query.input_text
    # Reject whitespace-only prompts as well as the empty string.
    if not input_text or not input_text.strip():
        raise HTTPException(status_code=400, detail="Input text cannot be empty.")
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs["input_ids"],
        # Forward the tokenizer's mask when it produced one, instead of
        # leaving generate() to infer it from the token ids.
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=50,
        # temperature only takes effect when sampling is enabled; without
        # do_sample=True decoding is greedy and the value is ignored.
        do_sample=True,
        temperature=0.7,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}