Commit f53adeb by de-Rodrigo · 1 Parent(s): bedfdc1

Visualize Saliency Maps
Files changed (3):
  1. __pycache__/utils.cpython-310.pyc +0 -0
  2. app.py +74 -36
  3. utils.py +157 -0

__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.56 kB)
app.py CHANGED
@@ -10,22 +10,22 @@ import re
 import logging
 from datasets import load_dataset
 import os
+import numpy as np
+from datetime import datetime
+# Import utils and save_img if they are not already imported
+import utils

 # Logging configuration
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-
 # Paths to the static image and GIF
 README_IMAGE_PATH = os.path.join("figs", "saliencies-merit-dataset.png")
 GIF_PATH = os.path.join("figs", "demo-samples.gif")

 # Global variables for Donut model, processor, and dataset
-donut_model = None
-donut_processor = None
 dataset = None

-
 def load_merit_dataset():
     global dataset
     if dataset is None:
@@ -34,7 +34,6 @@ def load_merit_dataset():
     )
     return dataset

-
 def get_image_from_dataset(index):
     global dataset
     if dataset is None:
@@ -42,44 +41,84 @@ def get_image_from_dataset(index):
     image_data = dataset[int(index)]["image"]
     return image_data

-
 def get_collection_models(tag: str) -> List[str]:
     """Get a list of models from a specific Hugging Face collection."""
     models = list_models(author="de-Rodrigo")
     return [model.modelId for model in models if tag in model.tags]

+def initialize_donut():
+    try:
+        donut_model = VisionEncoderDecoderModel.from_pretrained(
+            "de-Rodrigo/donut-merit"
+        )
+        donut_processor = DonutProcessor.from_pretrained("de-Rodrigo/donut-merit")
+        donut_model = donut_model.to("cuda")
+        logger.info("Donut model loaded successfully on GPU")
+        return donut_model, donut_processor
+    except Exception as e:
+        logger.error(f"Error loading Donut model: {str(e)}")
+        raise
+
+def compute_saliency(outputs, pixels, donut_p, image):
+    token_logits = torch.stack(outputs.scores, dim=1)
+    token_probs = torch.softmax(token_logits, dim=-1)
+    token_texts = []
+    saliency_images = []

-@spaces.GPU
-def get_donut():
-    global donut_model, donut_processor
-    if donut_model is None or donut_processor is None:
-        try:
-            donut_model = VisionEncoderDecoderModel.from_pretrained(
-                "de-Rodrigo/donut-merit"
-            )
-            donut_processor = DonutProcessor.from_pretrained("de-Rodrigo/donut-merit")
-            donut_model = donut_model.to("cuda")
-            logger.info("Donut model loaded successfully on GPU")
-        except Exception as e:
-            logger.error(f"Error loading Donut model: {str(e)}")
-            raise
-    return donut_model, donut_processor
+    for token_index in range(len(token_probs[0])):
+        target_token_prob = token_probs[
+            0, token_index, outputs.sequences[0, token_index]
+        ]
+
+        if pixels.grad is not None:
+            pixels.grad.zero_()
+
+        target_token_prob.backward(retain_graph=True)
+
+        saliency = pixels.grad.data.abs().squeeze().mean(dim=0)

+        token_id = outputs.sequences[0][token_index].item()
+        token_text = donut_p.tokenizer.decode([token_id])
+        logger.info(f"Considered sequence token: {token_text}")

-@spaces.GPU
-def process_image_donut(model, processor, image):
+        safe_token_text = re.sub(r'[<>:"/\\|?*]', "_", token_text)
+        current_datetime = datetime.now().strftime("%Y%m%d%H%M%S")
+
+        unique_safe_token_text = f"{safe_token_text}_{current_datetime}"
+        file_name = f"saliency_{unique_safe_token_text}.png"
+
+        saliency = utils.convert_tensor_to_rgba_image(saliency)
+
+        # Merge saliency image twice
+        saliency = utils.add_transparent_image(np.array(image), saliency)
+        saliency = utils.convert_rgb_to_rgba_image(saliency)
+        saliency = utils.add_transparent_image(np.array(image), saliency, 0.7)
+
+        saliency = utils.label_frame(saliency, token_text)
+
+        saliency_images.append(saliency)
+        token_texts.append(token_text)
+
+    return saliency_images, token_texts
+
+@spaces.GPU(duration=300)
+def process_image_donut(image):
     try:
+        model, processor = initialize_donut()
+
         if not isinstance(image, Image.Image):
             image = Image.fromarray(image)

         pixel_values = processor(image, return_tensors="pt").pixel_values.to("cuda")
+        pixel_values.requires_grad = True

         task_prompt = "<s_cord-v2>"
         decoder_input_ids = processor.tokenizer(
             task_prompt, add_special_tokens=False, return_tensors="pt"
         )["input_ids"].to("cuda")

-        outputs = model.generate(
+        outputs = model.generate.__wrapped__(
+            model,
             pixel_values,
             decoder_input_ids=decoder_input_ids,
             max_length=model.decoder.config.max_position_embeddings,
@@ -90,8 +129,11 @@ def process_image_donut(model, processor, image):
             num_beams=1,
             bad_words_ids=[[processor.tokenizer.unk_token_id]],
             return_dict_in_generate=True,
+            output_scores=True,
         )

+        saliency_images, token_texts = compute_saliency(outputs, pixel_values, processor, image)
+
         sequence = processor.batch_decode(outputs.sequences)[0]
         sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
             processor.tokenizer.pad_token, ""
@@ -99,31 +141,27 @@ def process_image_donut(model, processor, image):
         sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()

         result = processor.token2json(sequence)
-        return json.dumps(result, indent=2)
+        return saliency_images, json.dumps(result, indent=2)
     except Exception as e:
         logger.error(f"Error processing image with Donut: {str(e)}")
-        return f"Error: {str(e)}"
+        return None, f"Error: {str(e)}"

-
-@spaces.GPU
+@spaces.GPU(duration=300)
 def process_image(model_name, image=None, dataset_image_index=None):
     if dataset_image_index is not None:
         image = get_image_from_dataset(dataset_image_index)

     if model_name == "de-Rodrigo/donut-merit":
-        model, processor = get_donut()
-        result = process_image_donut(model, processor, image)
+        saliency_images, result = process_image_donut(image)
     else:
         # Here you should implement processing for other models
-        result = f"Processing for model {model_name} not implemented"
-
-    return image, result
+        saliency_images, result = None, f"Processing for model {model_name} not implemented"

+    return saliency_images, result

-
 def update_image(dataset_image_index):
     return get_image_from_dataset(dataset_image_index)

-
 if __name__ == "__main__":
     # Load the dataset
     load_merit_dataset()
@@ -180,7 +218,7 @@ if __name__ == "__main__":
     process_button = gr.Button("Process Image")

     with gr.Row():
-        output_image = gr.Image(label="Processed Image")
+        output_image = gr.Gallery(label="Processed Saliency Images")
         output_text = gr.Textbox(label="Result")

     # Update preview image when slider changes
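Note on the saliency mechanics above: process_image_donut now keeps gradients on pixel_values and calls model.generate.__wrapped__(model, ...) — generate() normally runs under torch.no_grad(), so going through __wrapped__ calls the undecorated method and keeps the autograd graph alive, while output_scores=True exposes the per-step logits that compute_saliency backpropagates. A minimal, self-contained sketch of the same idea (not the Space's exact code; model, processor, image and decoder_input_ids are assumed to be set up as in process_image_donut, and prompt_len/step are illustrative names):

import torch

pixel_values = processor(image, return_tensors="pt").pixel_values.to("cuda")
pixel_values.requires_grad_(True)

# generate() is wrapped in torch.no_grad(); calling the undecorated function through
# __wrapped__ keeps the graph so gradients can reach pixel_values.
outputs = model.generate.__wrapped__(
    model,
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    num_beams=1,
    output_scores=True,
    return_dict_in_generate=True,
)

# Saliency for one generated token: gradient of its probability w.r.t. the input pixels.
prompt_len = decoder_input_ids.shape[1]
step = 0                                                    # first generated token
probs = torch.softmax(outputs.scores[step], dim=-1)
target = probs[0, outputs.sequences[0, prompt_len + step]]  # probability of the chosen token
target.backward()
saliency = pixel_values.grad.abs().squeeze().mean(dim=0)    # (H, W) map, channel-averaged

The loop in compute_saliency repeats this for every generated token, zeroing pixels.grad between steps and keeping the graph with retain_graph=True.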
utils.py ADDED
@@ -0,0 +1,157 @@
+import cv2
+import os
+import glob
+import numpy as np
+from datetime import datetime
+
+
+def add_transparent_image(
+    background, foreground, alpha_factor=1.0, x_offset=None, y_offset=None
+):
+    """
+    Overlay a transparent (RGBA) foreground onto an RGB background.
+
+    Adapted from a StackOverflow answer by Ben; thanks to Ben for sharing it.
+
+    Original source:
+    https://stackoverflow.com/questions/40895785/
+    using-opencv-to-overlay-transparent-image-onto-another-image
+    """
+
+    bg_h, bg_w, bg_channels = background.shape
+    fg_h, fg_w, fg_channels = foreground.shape
+
+    assert (
+        bg_channels == 3
+    ), f"background image should have exactly 3 channels (RGB). found:{bg_channels}"
+    assert (
+        fg_channels == 4
+    ), f"foreground image should have exactly 4 channels (RGBA). found:{fg_channels}"
+
+    # center by default
+    if x_offset is None:
+        x_offset = (bg_w - fg_w) // 2
+    if y_offset is None:
+        y_offset = (bg_h - fg_h) // 2
+
+    w = min(fg_w, bg_w, fg_w + x_offset, bg_w - x_offset)
+    h = min(fg_h, bg_h, fg_h + y_offset, bg_h - y_offset)
+
+    if w < 1 or h < 1:
+        return
+
+    # clip foreground and background images to the overlapping regions
+    bg_x = max(0, x_offset)
+    bg_y = max(0, y_offset)
+    fg_x = max(0, x_offset * -1)
+    fg_y = max(0, y_offset * -1)
+    foreground = foreground[fg_y : fg_y + h, fg_x : fg_x + w]
+    background_subsection = background[bg_y : bg_y + h, bg_x : bg_x + w]
+
+    # separate alpha and color channels from the foreground image
+    foreground_colors = foreground[:, :, :3]
+    foreground_colors = cv2.cvtColor(foreground_colors, cv2.COLOR_BGR2RGB)
+    alpha_channel = foreground[:, :, 3] / 255 * alpha_factor  # 0-255 => 0.0-1.0
+
+    # construct an alpha_mask that matches the image shape
+    alpha_mask = np.dstack((alpha_channel, alpha_channel, alpha_channel))
+
+    # combine the background with the overlay image weighted by alpha
+    composite = (
+        background_subsection * (1 - alpha_mask) + foreground_colors * alpha_mask
+    )
+
+    # overwrite the section of the background image that has been updated
+    background[bg_y : bg_y + h, bg_x : bg_x + w] = composite
+
+    return background
+
+
+def convert_tensor_to_rgba_image(tensor):
+
+    saliency_array = tensor.cpu().numpy()
+
+    # Normalize to 0-255
+    if saliency_array.dtype != np.uint8:
+        saliency_array = (255 * saliency_array / saliency_array.max()).astype(np.uint8)
+
+    heatmap = cv2.applyColorMap(saliency_array, cv2.COLORMAP_JET)
+
+    # Make pixels with no saliency transparent: [128, 0, 0] (dark blue in BGR)
+    # is the lowest value in COLORMAP_JET
+    alpha_channel = np.ones(heatmap.shape[:2], dtype=heatmap.dtype) * 255
+    black_pixels_mask = np.all(heatmap == [128, 0, 0], axis=-1)
+    alpha_channel[black_pixels_mask] = 0
+
+    # Combine the RGB and alpha channels
+    saliency_rgba = cv2.merge((heatmap, alpha_channel))
+
+    return saliency_rgba
+
+
+def convert_rgb_to_rgba_image(image):
+
+    alpha_channel = np.ones(image.shape[:2], dtype=image.dtype) * 255
+    rgba = cv2.merge((cv2.cvtColor(image, cv2.COLOR_RGB2BGR), alpha_channel))
+
+    return rgba
+
+
+def label_frame(image, token):
+
+    # Add the text
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.7
+    text_color = (255, 255, 255)
+    text_thickness = 1
+    text_size, _ = cv2.getTextSize(token, font, font_scale, text_thickness)
+    text_position = (10, 10 + text_size[1])
+
+    # Draw a rectangle behind the text
+    rectangle_color = (0, 0, 0)
+    rectangle_thickness = -1
+    rectangle_position = (10, 10)
+    rectangle_size = (text_size[0] + 5, text_size[1] + 5)
+    cv2.rectangle(
+        image,
+        rectangle_position,
+        (
+            rectangle_position[0] + rectangle_size[0],
+            rectangle_position[1] + rectangle_size[1],
+        ),
+        rectangle_color,
+        rectangle_thickness,
+    )
+
+    cv2.putText(
+        image, token, text_position, font, font_scale, text_color, text_thickness
+    )
+
+    return image
+
+
+def saliency_video(path, sequence):
+
+    image_files = sorted(glob.glob(os.path.join(path, "*.png")), key=os.path.getctime)
+    image = cv2.imread(image_files[0])
+    height = image.shape[0]
+    width = image.shape[1]
+
+    # Create a VideoWriter object to save the video
+    video_name = os.path.join(path, "saliency.mp4")
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+
+    video = cv2.VideoWriter(video_name, fourcc, 5, (width, height))
+
+    for image_file, token in zip(image_files, sequence):
+
+        image = cv2.imread(image_file)
+
+        # Write the image to the video
+        video.write(image)
+
+    # Release the VideoWriter object
+    video.release()
+
+    print(f"Video saved as {video_name}")