This model is intended for debugging only. It is randomly initialized using the config from Qwen/Qwen2-VL-7B-Instruct, but with much smaller dimensions.

Usage:

from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

model_id = "yujiepan/qwen2-vl-tiny-random"

# Load the model; dtype and device placement are both left to from_pretrained
# ("auto"), so the script must not assume a particular device later on.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)

# Image
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

# A single user turn made of one image placeholder followed by a text prompt.
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
# Follow the device the model was actually placed on instead of hard-coding
# "cuda": with device_map="auto" the model may end up on CPU or another device.
inputs = inputs.to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=128)
# generate() returns prompt + continuation for each sample; strip the prompt
# tokens so only the newly generated part is decoded.
generated_ids = [
    sequence[len(prompt_ids):]
    for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)

Code used to create this model:

import os
from typing import Dict

import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
                          AutoTokenizer, GenerationConfig, pipeline, set_seed)
from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration

# Build a tiny, randomly initialized Qwen2-VL checkpoint and save it locally.
# `model_id` is the reference repo that supplies the config, the generation
# config and the processor; `save_path` is the local output directory.
model_id = "Qwen/Qwen2-VL-7B-Instruct"
repo_id = "yujiepan/qwen2-vl-tiny-random"
save_path = f"/tmp/{repo_id}"

# Start from the reference config and shrink the text and vision dimensions.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.hidden_size = 16
config.intermediate_size = 32
config.num_attention_heads = 2
config.num_hidden_layers = 2
config.num_key_value_heads = 1
config.vision_config.embed_dim = 16
config.vision_config.num_heads = 2
config.vision_config.hidden_size = 16
config.vision_config.depth = 2
# NOTE(review): presumably the required sum is tied to the per-head dimension
# (hidden_size / num_attention_heads = 8 here) — confirm against the model code.
config.rope_scaling['mrope_section'] = [1, 1, 2]  # sum needs to be 4 here

# Instantiate from the config alone (no pretrained weights are loaded), then
# cast to bfloat16, move to CUDA and switch to eval mode.
model = Qwen2VLForConditionalGeneration(config=config)
model = model.to(torch.bfloat16).cuda().eval()
# Reuse the generation config of the reference model.
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)
# Seed first, then overwrite every parameter with uniform values in
# [-0.3, 0.3]; parameters are visited in name-sorted order.
set_seed(42)
with torch.no_grad():
    for _, p in sorted(model.named_parameters()):
        torch.nn.init.uniform_(p, -0.3, 0.3)

# Save the model together with the reference model's processor, then list the
# output directory.
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference():
    """Reload the checkpoint saved at ``save_path`` and run one short generation.

    Downloads a demo image, builds a single-turn "describe this image" chat
    prompt, generates up to 16 new tokens on CUDA and prints the decoded
    continuation. Returns nothing.
    """
    image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    response = requests.get(image_url, stream=True)
    image = Image.open(response.raw)

    # One user message: an image placeholder followed by the text instruction.
    user_content = [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]
    conversation = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        save_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
    )

    text_prompt = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
    )
    inputs = processor(
        text=[text_prompt],
        images=[image],
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    sequences = model.generate(**inputs, max_new_tokens=16)

    # Drop the prompt tokens from each returned sequence so that only the
    # newly generated tokens are decoded.
    continuations = []
    for prompt_ids, sequence in zip(inputs.input_ids, sequences):
        continuations.append(sequence[len(prompt_ids):])

    decoded = processor.batch_decode(
        continuations,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print(decoded)


# Smoke test: reload what was just written to save_path and run one generation.
try_inference()
Downloads last month
34
Safetensors
Model size
4.9M params
Tensor type
BF16
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Model tree for yujiepan/qwen2-vl-tiny-random

Base model

Qwen/Qwen2-VL-7B
Finetuned
(116)
this model

Collection including yujiepan/qwen2-vl-tiny-random