import subprocess
import sys

# Install SentencePiece at runtime (required by MarianTokenizer for the
# English -> Indonesian model). NOTE(review): runtime pip installs are fragile
# on Spaces -- prefer declaring "sentencepiece" in requirements.txt instead.
subprocess.run([sys.executable, "-m", "pip", "install", "sentencepiece"])

import gradio as gr  # web UI (Hugging Face Spaces)
import torch
from PIL import Image  # image handling
from transformers import (
    BlipForConditionalGeneration,
    BlipProcessor,
    MarianMTModel,
    MarianTokenizer,
)

# Pick the device once (the original recomputed this three times).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# English -> Indonesian translation model (small; kept on CPU).
translation_model_id = "Helsinki-NLP/opus-mt-en-id"
translation_model = MarianMTModel.from_pretrained(translation_model_id)
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_id)

# BLIP image-captioning model (produces English captions), on GPU if available.
caption_model_id = "Salesforce/blip-image-captioning-large"
caption_processor = BlipProcessor.from_pretrained(caption_model_id)
caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_id).to(device)


def generate_caption(input_image):
    """Caption an image in English and translate the caption to Indonesian.

    Args:
        input_image: a PIL.Image in any mode; it is converted to RGB.

    Returns:
        A ``(english_caption, indonesian_caption)`` tuple of strings.
        Captions are sampled (top-k/top-p), so repeated calls may differ.
    """
    image = input_image.convert("RGB")

    # BUGFIX: the pixel tensors must live on the same device as the model;
    # the original left them on the CPU, which raises a device-mismatch
    # error whenever CUDA is available.
    inputs = caption_processor(images=image, return_tensors="pt").to(device)

    # Pass decoding options as generate() kwargs rather than smuggling them
    # through the processor's input dict (fragile against key collisions).
    caption_output = caption_model.generate(
        **inputs,
        max_length=20,
        num_beams=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    english_caption = caption_processor.decode(
        caption_output[0], skip_special_tokens=True
    )

    # Translate the English caption to Indonesian (model lives on the CPU).
    translation_inputs = translation_tokenizer.encode(
        english_caption, return_tensors="pt", max_length=512, truncation=True
    )
    translation_output = translation_model.generate(translation_inputs)
    indonesian_caption = translation_tokenizer.decode(
        translation_output[0], skip_special_tokens=True
    )

    return english_caption, indonesian_caption


# FIX: gr.inputs / gr.outputs were removed in Gradio 3.x -- use the component
# classes directly. Two text outputs: English caption, Indonesian caption.
iface = gr.Interface(
    generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Textbox(), gr.Textbox()],
    live=True,
)
iface.launch()