tcm03 committed
Commit e87d45a · 0 Parent(s)

LongVU configs
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
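
These LFS rules route the weight files (`*.safetensors`, `*.bin`) through Git LFS, so a plain `git clone` without LFS fetches only pointer files. A minimal sketch of pulling the full checkpoint with `huggingface_hub` — the `repo_id` and `local_dir` below are placeholders, not taken from this commit:

```python
# Sketch: download the full checkpoint, including LFS-tracked weight files.
# Assumes `pip install huggingface_hub`; repo_id/local_dir are placeholders.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="Vision-CAIR/LongVU_Qwen2_7B",  # assumed repo id, adjust to this repository
    local_dir="./checkpoints/longvu_qwen",  # matches the path used in the README example
)
```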
README.md ADDED
@@ -0,0 +1,136 @@
+ ---
+ datasets:
+ - shenxq/OneVision
+ - shenxq/VideoChat2
+ base_model:
+ - Vision-CAIR/LongVU_Qwen2_7B_img
+ pipeline_tag: video-text-to-text
+ model-index:
+ - name: llava-onevision-qwen-7b-ov
+   results:
+   - task:
+       type: multimodal
+     dataset:
+       name: EgoSchema
+       type: egoschema
+     metrics:
+     - type: accuracy
+       value: 67.6
+       name: accuracy
+       verified: true
+   - task:
+       type: multimodal
+     dataset:
+       name: MLVU
+       type: mlvu
+     metrics:
+     - type: accuracy
+       value: 65.4
+       name: accuracy
+       verified: true
+   - task:
+       type: multimodal
+     dataset:
+       name: MVBench
+       type: mvbench
+     metrics:
+     - type: accuracy
+       value: 66.9
+       name: accuracy
+       verified: true
+   - task:
+       type: multimodal
+     dataset:
+       name: VideoMME
+       type: videomme
+     metrics:
+     - type: accuracy
+       value: 60.6
+       name: accuracy
+       verified: true
+ ---
+ # LongVU
+
+ This repository contains the Qwen2-7B-based model presented in [LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding](https://huggingface.co/papers/2410.17434).
+
+ Play with the model on the [HF demo](https://huggingface.co/spaces/Vision-CAIR/LongVU).
+
+ <div align="left">
+ <a href='https://vision-cair.github.io/LongVU'><img src="https://longvu.s3.amazonaws.com/assets/demo.gif" alt="Demo GIF" style="width: 100%; max-width: 650px;"></a>
+ </div>
+
+ # Use
+
+ We provide a simple generation example below. For more details, please refer to the [GitHub repository](https://github.com/Vision-CAIR/LongVU).
+
+ ```python
+ # git clone https://github.com/Vision-CAIR/LongVU
+ import numpy as np
+ import torch
+ from longvu.builder import load_pretrained_model
+ from longvu.constants import (
+     DEFAULT_IMAGE_TOKEN,
+     IMAGE_TOKEN_INDEX,
+ )
+ from longvu.conversation import conv_templates, SeparatorStyle
+ from longvu.mm_datautils import (
+     KeywordsStoppingCriteria,
+     process_images,
+     tokenizer_image_token,
+ )
+ from decord import cpu, VideoReader
+
+ # Load the checkpoint downloaded into ./checkpoints/longvu_qwen
+ tokenizer, model, image_processor, context_len = load_pretrained_model(
+     "./checkpoints/longvu_qwen", None, "cambrian_qwen",
+ )
+
+ model.eval()
+ video_path = "./examples/video1.mp4"
+ qs = "Describe this video in detail"
+
+ # Sample roughly one frame per second from the video
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+ fps = float(vr.get_avg_fps())
+ frame_indices = np.arange(0, len(vr), round(fps))
+ video = []
+ for frame_index in frame_indices:
+     img = vr[frame_index].asnumpy()
+     video.append(img)
+ video = np.stack(video)
+ image_sizes = [video[0].shape[:2]]
+ video = process_images(video, image_processor, model.config)
+ video = [item.unsqueeze(0) for item in video]
+
+ # Build the prompt in the Qwen conversation template, with the image token prepended
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+ conv = conv_templates["qwen"].copy()
+ conv.append_message(conv.roles[0], qs)
+ conv.append_message(conv.roles[1], None)
+ prompt = conv.get_prompt()
+
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+ keywords = [stop_str]
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+ # Greedy decoding (temperature has no effect when do_sample=False)
+ with torch.inference_mode():
+     output_ids = model.generate(
+         input_ids,
+         images=video,
+         image_sizes=image_sizes,
+         do_sample=False,
+         temperature=0.2,
+         max_new_tokens=128,
+         use_cache=True,
+         stopping_criteria=[stopping_criteria],
+     )
+ pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+ print(pred)
+ ```
+
+ # Citation
+
+ ```
+ @article{shen2024longvu,
+   title={LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding},
+   author={Shen, Xiaoqian and Xiong, Yunyang and Zhao, Changsheng and Wu, Lemeng and Chen, Jun and Zhu, Chenchen and Liu, Zechun and Xiao, Fanyi and Varadarajan, Balakrishnan and Bordes, Florian and Liu, Zhuang and Xu, Hu and J. Kim, Hyunwoo and Soran, Bilge and Krishnamoorthi, Raghuraman and Elhoseiny, Mohamed and Chandra, Vikas},
+   journal={arXiv:2410.17434},
+   year={2024}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,83 @@
+ {
+   "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09051611-cambrian_qwenvl_t576_ov",
+   "architectures": [
+     "CambrianQwenForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "connect_layer": 2,
+   "connector_depth": 3,
+   "connector_only": true,
+   "dino_threshold": 0.83,
+   "drop_threshold": 0.7,
+   "eos_token_id": 151645,
+   "frame_pos": false,
+   "freeze_mm_mlp_adapter": false,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "highres": true,
+   "highres_connect": false,
+   "image_aspect_ratio": "pad",
+   "image_position": 91,
+   "image_token_len": 144,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "is_st_sampler": false,
+   "lowres_token": 8,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "mm_patch_merge_type": "flat",
+   "mm_projector_lr": null,
+   "mm_projector_type": "sva",
+   "mm_use_im_patch_token": false,
+   "mm_use_im_start_end": false,
+   "mm_vision_sampler_lr": null,
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -2,
+   "mm_vision_tower_aux_list": [
+     "siglip/CLIP-ViT-SO400M-14-384",
+     "facebook/dinov2-giant-res378"
+   ],
+   "mm_vision_tower_aux_token_len_list": [
+     576,
+     576
+   ],
+   "mm_vision_tower_lr": null,
+   "model_type": "cambrian_qwen",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "num_of_vision_sampler_layers": 10,
+   "num_query_group": 1,
+   "pretraining_tp": 1,
+   "query_num_list": [
+     144
+   ],
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "spmd_debug": null,
+   "spmd_fsdp_sharding": null,
+   "spmd_mesh": null,
+   "start_of_vision_sampler_layers": 0,
+   "stride_of_vision_sampler_layers": 3,
+   "tie_word_embeddings": false,
+   "tokenizer_model_max_length": 10000,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.2",
+   "tune_mm_mlp_adapter": false,
+   "unfreeze_mm_vision_tower": false,
+   "use_cache": false,
+   "use_mm_proj": true,
+   "use_pos_skipping": false,
+   "use_sliding_window": false,
+   "vision_hidden_size": 1024,
+   "vision_tower_aux_token_len_list": [
+     576,
+     576
+   ],
+   "vocab_size": 152064
+ }
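
For reference, a minimal sketch of inspecting a locally downloaded copy of this config with the standard `json` module; the local path is an assumption, matching the checkpoint directory used in the README example:

```python
import json

# Sketch: read the config.json shipped with the checkpoint (local path is assumed).
with open("./checkpoints/longvu_qwen/config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                             # cambrian_qwen
print(cfg["hidden_size"], cfg["num_hidden_layers"])  # 3584 28 (Qwen2-7B backbone)
print(cfg["mm_vision_tower_aux_list"])               # SigLIP + DINOv2 vision towers
```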
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e846f373072ab8e42ee7963e21514d543696ee2859c30570bb1b05a88d94f3ca
+ size 15343381968
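
This is the Git LFS pointer for the weights; per the LFS spec, the `oid` is the SHA-256 digest of the actual file. A minimal sketch for verifying a downloaded `model.safetensors` against it (the local path is an assumption):

```python
import hashlib

# Sketch: check a downloaded model.safetensors against the LFS pointer above.
# The path is a placeholder; the expected digest is the `oid sha256:` value.
EXPECTED = "e846f373072ab8e42ee7963e21514d543696ee2859c30570bb1b05a88d94f3ca"

sha = hashlib.sha256()
with open("./checkpoints/longvu_qwen/model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        sha.update(chunk)

print("OK" if sha.hexdigest() == EXPECTED else "hash mismatch")
```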
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<image>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "processor_class": "LlavaProcessor",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
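
The `chat_template` above is the standard Qwen2 ChatML format. A minimal sketch of loading these tokenizer files from a local copy and rendering a prompt with them; the local path is an assumption, and a transformers version supporting `apply_chat_template` is required:

```python
# Sketch: load the tokenizer files above and render a ChatML prompt.
# The local path is a placeholder for wherever this repository was downloaded.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./checkpoints/longvu_qwen")

messages = [{"role": "user", "content": "Describe this video in detail"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Describe this video in detail<|im_end|>
# <|im_start|>assistant

print(tokenizer.convert_tokens_to_ids("<image>"))  # 151646, per added_tokens_decoder
```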
vocab.json ADDED
The diff for this file is too large to render. See raw diff