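# Streamlit demo for meta-llama/Llama-3.2-11B-Vision-Instruct.
# Assumes HUGGINGFACE_TOKEN is available as an environment variable (e.g. a Space
# secret) for an account that has been granted access to the gated checkpoint, and
# that enough GPU memory is available for the 11B model in bfloat16.
# Run locally with: streamlit run app.py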
import os
import streamlit as st
from huggingface_hub import login
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
# Step 1: Log in to Hugging Face with your access token from secrets
huggingface_token = os.getenv("HUGGINGFACE_TOKEN") # Fetch the token from environment
if huggingface_token:
    login(token=huggingface_token)  # Authenticate using the token
else:
    st.error("Hugging Face token not found. Please set it in the Secrets section.")
    st.stop()  # Halt the script; the gated model cannot be loaded without credentials
# Step 2: Load the model and processor
try:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        token=huggingface_token,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(
        model_name,
        token=huggingface_token,  # `use_auth_token` is deprecated; `token` is the current argument
    )
    st.success("Model and processor loaded successfully!")
except Exception as e:
    st.error(f"Error loading model or processor: {str(e)}")
# Step 3: Create a simple Streamlit app
def main():
    st.title("Llama 3.2 11B Vision Model")
    st.write("Upload an image and enter a prompt to generate output.")

    # Upload image and collect the text prompt
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    prompt = st.text_area("Enter your prompt here:")

    if st.button("Generate Output"):
        if image_file and prompt:
            # Load image
            image = Image.open(image_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_column_width=True)
            try:
                # Prepare the messages in the format expected by the processor
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image"}
                        ]
                    }
                ]
                # Apply the chat template to build the full prompt string
                input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
                # Prepare inputs and move them to the same device as the model
                inputs = processor(
                    text=input_text,
                    images=[image],
                    return_tensors="pt"
                ).to(model.device)
                # Generate output
                with torch.no_grad():
                    output_ids = model.generate(
                        **inputs,
                        max_new_tokens=250,
                    )
                # Decode only the newly generated tokens, skipping the prompt portion
                generated_ids = output_ids[:, inputs["input_ids"].shape[-1]:]
                generated_output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
                st.write("Generated Output:", generated_output)
            except Exception as e:
                st.error(f"Error during prediction: {str(e)}")
        else:
            st.warning("Please upload an image and enter a prompt.")

if __name__ == "__main__":
    main()