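# Streamlit demo for meta-llama/Llama-3.2-11B-Vision-Instruct.
# Assumes HUGGINGFACE_TOKEN is available as an environment variable (e.g. a Space
# secret) for an account that has been granted access to the gated checkpoint, and
# that enough GPU memory is available for the 11B model in bfloat16.
# Run locally with: streamlit run app.py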
import os
import streamlit as st
from huggingface_hub import login
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
# Step 1: Log in to Hugging Face with your access token from secrets
huggingface_token = os.getenv("HUGGINGFACE_TOKEN") # Fetch the token from environment
if huggingface_token:
    login(token=huggingface_token)  # Authenticate using the token
else:
    st.error("Hugging Face token not found. Please set it in the Secrets section.")
    st.stop()  # Halt the script; the gated model cannot be loaded without credentials
# Step 2: Load the model and processor
try:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        token=huggingface_token,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(
        model_name,
        token=huggingface_token,  # `use_auth_token` is deprecated; `token` is the current argument
    )
    st.success("Model and processor loaded successfully!")
except Exception as e:
    st.error(f"Error loading model or processor: {str(e)}")
# Step 3: Create a simple Streamlit app
def main():
    st.title("Llama 3.2 11B Vision Model")
    st.write("Upload an image and enter a prompt to generate output.")

    # Upload image and collect the text prompt
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    prompt = st.text_area("Enter your prompt here:")

    if st.button("Generate Output"):
        if image_file and prompt:
            # Load image
            image = Image.open(image_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_column_width=True)
            try:
                # Prepare the messages in the format expected by the processor
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image"}
                        ]
                    }
                ]
                # Apply the chat template to build the full prompt string
                input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
                # Prepare inputs and move them to the same device as the model
                inputs = processor(
                    text=input_text,
                    images=[image],
                    return_tensors="pt"
                ).to(model.device)
                # Generate output
                with torch.no_grad():
                    output_ids = model.generate(
                        **inputs,
                        max_new_tokens=250,
                    )
                # Decode only the newly generated tokens, skipping the prompt portion
                generated_ids = output_ids[:, inputs["input_ids"].shape[-1]:]
                generated_output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
                st.write("Generated Output:", generated_output)
            except Exception as e:
                st.error(f"Error during prediction: {str(e)}")
        else:
            st.warning("Please upload an image and enter a prompt.")

if __name__ == "__main__":
    main()