Update processing_phi3_v.py #30
by rageSpin - opened

Files changed: processing_phi3_v.py +22 -4
processing_phi3_v.py CHANGED

@@ -56,7 +56,7 @@ logger = logging.get_logger(__name__)
 if is_vision_available():
     from PIL import Image
 
-import torch
+import torch  # why is this imported twice?
 import torchvision
 
 def padding_336(b):
@@ -205,6 +205,22 @@ class Phi3VImageProcessor(BaseImageProcessor):
         num_img_tokens = int((new_height // 336 * new_width // 336 + 1) * 144 + 1 + (new_height // 336 + 1) * 12)
         return num_img_tokens
 
+    def convert_PIL(self, image):
+        """
+        Convert an image to a PIL Image object if it is not already one.
+
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`): The image to convert; a numpy array, a torch tensor, or a PIL Image.
+
+        Returns:
+            A PIL Image object.
+        """
+        if not isinstance(image, Image.Image):
+            return torchvision.transforms.functional.to_pil_image(image)
+        else:
+            return image
+
+
     def preprocess(
         self,
         images: ImageInput,
@@ -245,8 +261,8 @@ class Phi3VImageProcessor(BaseImageProcessor):
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        if do_convert_rgb:
-            images = [convert_to_rgb(image) for image in images]
+        # if do_convert_rgb:
+        #     images = [convert_to_rgb(image) for image in images]
 
         image_sizes = []
         img_processor = torchvision.transforms.Compose([
@@ -256,7 +272,9 @@ class Phi3VImageProcessor(BaseImageProcessor):
 
         # PIL images
         # HD_transform pad images to size of multiiply of 336, 336
-        # convert
+        # check and convert if the images are in PIL format
+        images = [self.convert_PIL(image) for image in images]
+        # convert to RGB first (I think the argument "do_convert_rgb" is useless, since it is forced here)
         images = [image.convert('RGB') for image in images]
         elems = [HD_transform(im, hd_num = self.num_crops) for im in images]
         # tensor transform and normalize
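A quick worked example of the token-count formula that the second hunk leaves in place. The 672 x 1344 size is an illustrative assumption (a 2 x 4 grid of 336-px tiles), not a value taken from the PR:

# Worked example of the num_img_tokens formula, assuming an image
# already HD-padded to 672 x 1344 (a 2 x 4 grid of 336-px tiles).
new_height, new_width = 672, 1344
num_img_tokens = int((new_height // 336 * new_width // 336 + 1) * 144 + 1 + (new_height // 336 + 1) * 12)
# (2*4 + 1) * 144 + 1 + (2 + 1) * 12 = 1296 + 1 + 36
print(num_img_tokens)  # 1333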
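A minimal standalone sketch of how the new convert_PIL helper behaves for the three input types its docstring names; this re-statement is illustrative only, not part of the diff:

import numpy as np
import torch
import torchvision
from PIL import Image

def convert_PIL(image):
    # Same logic as the new method: pass PIL images through,
    # convert array/tensor inputs via torchvision.
    if not isinstance(image, Image.Image):
        return torchvision.transforms.functional.to_pil_image(image)
    return image

print(type(convert_PIL(np.zeros((336, 336, 3), dtype=np.uint8))))  # PIL.Image.Image (HWC uint8 array)
print(type(convert_PIL(torch.rand(3, 336, 336))))                  # PIL.Image.Image (CHW float tensor)
pil = Image.new('RGB', (336, 336))
print(convert_PIL(pil) is pil)                                     # True (PIL passes through unchanged)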
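On the commented-out do_convert_rgb branch in the third hunk: the unconditional image.convert('RGB') call in the fourth hunk already normalizes every PIL mode, which is what the author's inline comment argues. A small sketch of that behavior, using hypothetical 8 x 8 blank images:

from PIL import Image

# Image.convert('RGB') maps any source mode to RGB, so a separate
# do_convert_rgb flag has no effect once this call runs unconditionally.
for mode in ('L', 'RGBA', 'CMYK'):
    img = Image.new(mode, (8, 8))
    print(mode, '->', img.convert('RGB').mode)  # always 'RGB'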