LongVU configs

tcm03 committed · Commit e87d45a · 0 Parent(s)
Files changed:
- .gitattributes +2 -0
- README.md +136 -0
- config.json +83 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- special_tokens_map.json +20 -0
- tokenizer.json +0 -0
- tokenizer_config.json +53 -0
- vocab.json +0 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,136 @@
---
datasets:
- shenxq/OneVision
- shenxq/VideoChat2
base_model:
- Vision-CAIR/LongVU_Qwen2_7B_img
pipeline_tag: video-text-to-text
model-index:
- name: llava-onevision-qwen-7b-ov
  results:
  - task:
      type: multimodal
    dataset:
      name: EgoSchema
      type: egoschema
    metrics:
    - type: accuracy
      value: 67.6
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: MLVU
      type: mlvu
    metrics:
    - type: accuracy
      value: 65.4
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: MVBench
      type: mvbench
    metrics:
    - type: accuracy
      value: 66.9
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: VideoMME
      type: videomme
    metrics:
    - type: accuracy
      value: 60.6
      name: accuracy
      verified: true
---

# LongVU

This repository contains the Qwen2-7B-based model presented in [LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding](https://huggingface.co/papers/2410.17434).

Play with the model on the [HF demo](https://huggingface.co/spaces/Vision-CAIR/LongVU).

<div align="left">
<a href='https://vision-cair.github.io/LongVU'><img src="https://longvu.s3.amazonaws.com/assets/demo.gif" alt="Demo GIF" style="width: 100%; max-width: 650px;"></a>
</div>

# Use

We provide a simple generation example below. For more details, please refer to the [GitHub repository](https://github.com/Vision-CAIR/LongVU).

```python
# git clone https://github.com/Vision-CAIR/LongVU
import numpy as np
import torch
from longvu.builder import load_pretrained_model
from longvu.constants import (
    DEFAULT_IMAGE_TOKEN,
    IMAGE_TOKEN_INDEX,
)
from longvu.conversation import conv_templates, SeparatorStyle
from longvu.mm_datautils import (
    KeywordsStoppingCriteria,
    process_images,
    tokenizer_image_token,
)
from decord import cpu, VideoReader

tokenizer, model, image_processor, context_len = load_pretrained_model(
    "./checkpoints/longvu_qwen", None, "cambrian_qwen",
)

model.eval()
video_path = "./examples/video1.mp4"
qs = "Describe this video in detail"

# Sample roughly one frame per second from the video.
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
fps = float(vr.get_avg_fps())
frame_indices = np.arange(0, len(vr), round(fps))
video = []
for frame_index in frame_indices:
    img = vr[frame_index].asnumpy()
    video.append(img)
video = np.stack(video)
image_sizes = [video[0].shape[:2]]
video = process_images(video, image_processor, model.config)
video = [item.unsqueeze(0) for item in video]

# Prepend the image placeholder token and build the chat prompt.
qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
conv = conv_templates["qwen"].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=video,
        image_sizes=image_sizes,
        do_sample=False,  # greedy decoding; temperature is ignored when sampling is off
        temperature=0.2,
        max_new_tokens=128,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
    )
pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
```
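The example above expects the checkpoint under `./checkpoints/longvu_qwen`. A minimal sketch of fetching this repository's files to that path with `huggingface_hub`; the repo id below is a placeholder, not the actual id:

```python
from huggingface_hub import snapshot_download

# Placeholder repo id: substitute the actual Hub id of this repository.
snapshot_download(repo_id="<this-repo-id>", local_dir="./checkpoints/longvu_qwen")
```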

# Citation

```
@article{shen2024longvu,
  title={LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding},
  author={Shen, Xiaoqian and Xiong, Yunyang and Zhao, Changsheng and Wu, Lemeng and Chen, Jun and Zhu, Chenchen and Liu, Zechun and Xiao, Fanyi and Varadarajan, Balakrishnan and Bordes, Florian and Liu, Zhuang and Xu, Hu and J. Kim, Hyunwoo and Soran, Bilge and Krishnamoorthi, Raghuraman and Elhoseiny, Mohamed and Chandra, Vikas},
  journal={arXiv:2410.17434},
  year={2024}
}
```
config.json
ADDED
@@ -0,0 +1,83 @@
{
  "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09051611-cambrian_qwenvl_t576_ov",
  "architectures": [
    "CambrianQwenForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "connect_layer": 2,
  "connector_depth": 3,
  "connector_only": true,
  "dino_threshold": 0.83,
  "drop_threshold": 0.7,
  "eos_token_id": 151645,
  "frame_pos": false,
  "freeze_mm_mlp_adapter": false,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "highres": true,
  "highres_connect": false,
  "image_aspect_ratio": "pad",
  "image_position": 91,
  "image_token_len": 144,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "is_st_sampler": false,
  "lowres_token": 8,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "mm_patch_merge_type": "flat",
  "mm_projector_lr": null,
  "mm_projector_type": "sva",
  "mm_use_im_patch_token": false,
  "mm_use_im_start_end": false,
  "mm_vision_sampler_lr": null,
  "mm_vision_select_feature": "patch",
  "mm_vision_select_layer": -2,
  "mm_vision_tower_aux_list": [
    "siglip/CLIP-ViT-SO400M-14-384",
    "facebook/dinov2-giant-res378"
  ],
  "mm_vision_tower_aux_token_len_list": [
    576,
    576
  ],
  "mm_vision_tower_lr": null,
  "model_type": "cambrian_qwen",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "num_of_vision_sampler_layers": 10,
  "num_query_group": 1,
  "pretraining_tp": 1,
  "query_num_list": [
    144
  ],
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "spmd_debug": null,
  "spmd_fsdp_sharding": null,
  "spmd_mesh": null,
  "start_of_vision_sampler_layers": 0,
  "stride_of_vision_sampler_layers": 3,
  "tie_word_embeddings": false,
  "tokenizer_model_max_length": 10000,
  "tokenizer_padding_side": "right",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "tune_mm_mlp_adapter": false,
  "unfreeze_mm_vision_tower": false,
  "use_cache": false,
  "use_mm_proj": true,
  "use_pos_skipping": false,
  "use_sliding_window": false,
  "vision_hidden_size": 1024,
  "vision_tower_aux_token_len_list": [
    576,
    576
  ],
  "vocab_size": 152064
}
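The config wires the Qwen2-7B backbone to two auxiliary vision towers and fixes the per-frame token budget used by LongVU's compression. A minimal sketch, assuming `config.json` has been downloaded locally, of inspecting a few of these fields with the standard library:

```python
import json

# Minimal sketch: read the LongVU config above (assumes a local copy of config.json).
with open("config.json") as f:
    cfg = json.load(f)

# Two auxiliary vision towers, each contributing 576 tokens per frame before compression.
print(cfg["mm_vision_tower_aux_list"])            # SigLIP and DINOv2 encoders
print(cfg["mm_vision_tower_aux_token_len_list"])  # [576, 576]

# Each frame ends up as 144 visual tokens after the projector.
print(cfg["mm_projector_type"], cfg["image_token_len"])

# Thresholds steering the adaptive frame/token reduction described in the paper.
print(cfg["dino_threshold"], cfg["drop_threshold"])
```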
merges.txt
ADDED
The diff for this file is too large to render.
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e846f373072ab8e42ee7963e21514d543696ee2859c30570bb1b05a88d94f3ca
size 15343381968
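The three lines above are only a Git LFS pointer; the real weight file (about 15.3 GB of float32 tensors, per `torch_dtype` in the config) is stored in LFS. A minimal sketch, assuming the actual file has been pulled (for example with `git lfs pull`), of loading it with the `safetensors` library:

```python
from safetensors.torch import load_file

# Minimal sketch: load the weights referenced by the LFS pointer above
# (requires the actual ~15.3 GB file, not just the pointer).
state_dict = load_file("model.safetensors")
total_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {total_params / 1e9:.2f}B parameters")
```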
special_tokens_map.json
ADDED
@@ -0,0 +1,20 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,53 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<image>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "bos_token": null,
  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "model_max_length": 32768,
  "pad_token": "<|endoftext|>",
  "padding_side": "right",
  "processor_class": "LlavaProcessor",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
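The `chat_template` above is the standard Qwen2 ChatML template; it injects a default system message when none is supplied. A minimal sketch, assuming the tokenizer files have been downloaded to a local directory (the path below is a placeholder), of rendering a prompt with `transformers`:

```python
from transformers import AutoTokenizer

# Placeholder path: a local copy of this repository's tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("./longvu_qwen")
messages = [{"role": "user", "content": "Describe this video in detail"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Describe this video in detail<|im_end|>
# <|im_start|>assistant
```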
vocab.json
ADDED
The diff for this file is too large to render.