DAMO-NLP-SG
/

VideoLLaMA3-7B

Visual Question Answering

videollama3_qwen2

text-generation

large-language-model

video-language-model

Model card Files Files and versions Community

lkhl committed 3 days ago

Commit

fd2ad62

·

verified ·

1 Parent(s): edde2a5

Update processing_videollama3.py

Files changed (1) hide show

processing_videollama3.py +9 -8

processing_videollama3.py CHANGED Viewed

@@ -681,14 +681,15 @@ class Videollama3Qwen2Processor(ProcessorMixin):
         kwargs.pop("padding")
         kwargs.pop("padding_side")
-        image_idx = 0
-        while DEFAULT_IMAGE_TOKEN in text:
-            num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
-            text = text.replace(DEFAULT_IMAGE_TOKEN, "<placeholder>" * num_tokens, 1)
-            image_idx += 1
-        text = text.replace("<placeholder>", DEFAULT_IMAGE_TOKEN)
-        assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."
         text_inputs = self.tokenizer(text, **kwargs)
         return text_inputs

         kwargs.pop("padding")
         kwargs.pop("padding_side")
+        if len(grid_sizes) > 0:
+            image_idx = 0
+            while DEFAULT_IMAGE_TOKEN in text:
+                num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
+                text = text.replace(DEFAULT_IMAGE_TOKEN, "<placeholder>" * num_tokens, 1)
+                image_idx += 1
+            text = text.replace("<placeholder>", DEFAULT_IMAGE_TOKEN)
+            assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."
         text_inputs = self.tokenizer(text, **kwargs)
         return text_inputs