GoodiesHere committed 8 days ago

Commit

6f910e1

•

1 Parent(s): f0b6580

Upload folder using huggingface_hub

Browse files

Files changed (36) hide show

.mdl +0 -0
.msc +0 -0
.mv +1 -0
README.md +121 -0
config.json +293 -0
configuration.json +1 -0
configuration_apollo.py +47 -0
llm/added_tokens.json +24 -0
llm/config.json +31 -0
llm/generation_config.json +14 -0
llm/merges.txt +0 -0
llm/model-00001-of-00004.safetensors +3 -0
llm/model-00002-of-00004.safetensors +3 -0
llm/model-00003-of-00004.safetensors +3 -0
llm/model-00004-of-00004.safetensors +3 -0
llm/model.safetensors.index.json +346 -0
llm/special_tokens_map.json +31 -0
llm/tokenizer.json +0 -0
llm/tokenizer_config.json +209 -0
llm/vocab.json +0 -0
mm_connector.py +306 -0
mm_connector/config.json +30 -0
mm_connector/configuration_connector.py +38 -0
mm_connector/model.safetensors +3 -0
modeling_apollo.py +492 -0
vision_tower.py +556 -0
vision_tower/config.json +18 -0
vision_tower/configuration_hybrid.py +48 -0
vision_tower/internvideo2/config.json +54 -0
vision_tower/internvideo2/configuration_internvideo2.py +91 -0
vision_tower/internvideo2/model.safetensors +3 -0
vision_tower/internvideo2/modeling_internvideo2.py +934 -0
vision_tower/internvideo2/preprocessor_config.json +30 -0
vision_tower/siglip-so400m-patch14-384/config.json +19 -0
vision_tower/siglip-so400m-patch14-384/model.safetensors +3 -0
vision_tower/siglip-so400m-patch14-384/preprocessor_config.json +24 -0

.mdl ADDED Viewed

Binary file (48 Bytes). View file

.msc ADDED Viewed

Binary file (2.81 kB). View file

.mv ADDED Viewed

	@@ -0,0 +1 @@


1	+ Revision:master,CreatedAt:1734398315

README.md ADDED Viewed

	@@ -0,0 +1,121 @@

+---
+license: apache-2.0
+language:
+- en
+pipeline_tag: video-text-to-text
+tags:
+- video
+- video-understanding
+- vision
+- multimodal
+- conversational
+- custom_code
+- instruction-tuning
+library_name: transformers
+---
+# Apollo: An Exploration of Video Understanding in Large Multimodal Models
+Apollo is a family of Large Multimodal Models (LMMs) that push the state of the art in video understanding. It supports tasks including:
+- Long-form video comprehension
+- Temporal reasoning
+- Complex video question-answering
+- Multi-turn conversations grounded in video content
+Apollo models excel at handling hour-long videos, balancing speed and accuracy through strategic design decisions. Our models outperform most 7B competitors at just 3B parameters and even rival 30B-scale models.
+**Key Highlights:**
+- **7B model variant**
+- **32 tokens/frame**
+## Quick Start
+**Installation:**
+```bash
+pip install -e .
+pip install flash-attn --no-build-isolation
+```
+**Inference Example:**
+```python
+import torch
+from transformers import AutoModelForCausalLM
+from apollo.mm_utils import (
+    KeywordsStoppingCriteria,
+    tokenizer_mm_token,
+    ApolloMMLoader
+)
+from apollo.conversations import conv_templates, SeparatorStyle
+from huggingface_hub import snapshot_download
+model_url = "Apollo-LMMs/Apollo-7B-t32"
+model_path = snapshot_download(model_url, repo_type="model")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    low_cpu_mem_usage=True
+).to(device=device, dtype=torch.bfloat16)
+tokenizer = model.tokenizer
+vision_processors = model.vision_tower.vision_processor
+config = model.config
+num_repeat_token = config.mm_connector_cfg['num_output_tokens']
+mm_processor = ApolloMMLoader(
+    vision_processors,
+    config.clip_duration,
+    frames_per_clip=4,
+    clip_sampling_ratio=0.65,
+    model_max_length=config.model_max_length,
+    device=device,
+    num_repeat_token=num_repeat_token
+)
+video_path = "path/to/video.mp4"
+question = "Describe this video in detail"
+mm_data, replace_string = mm_processor.load_video(video_path)
+conv = conv_templates["qwen_2"].copy()
+conv.append_message(conv.roles[0], replace_string + "\n\n" + question)
+conv.append_message(conv.roles[1], None)
+prompt = conv.get_prompt()
+input_ids = tokenizer_mm_token(prompt, tokenizer, return_tensors="pt").unsqueeze(0).to(device)
+stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
+with torch.inference_mode():
+    output_ids = model.generate(
+        input_ids,
+        vision_input=[mm_data],
+        data_types=['video'],
+        do_sample=True,
+        temperature=0.4,
+        max_new_tokens=256,
+        top_p=0.7,
+        use_cache=True,
+        num_beams=1,
+        stopping_criteria=[stopping_criteria]
+    )
+pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+print(pred)
+```
+## Citation
+If you find this project useful, please consider citing:
+```BibTeX
+@article{zohar2024apollo,
+    title={Apollo: An Exploration of Video Understanding in Large Multimodal Models},
+    author={Zohar, Orr and Wang, Xiaohan and Dubois, Yann and Mehta, Nikhil and Xiao, Tong and Hansen-Estruch, Philippe and Yu, Licheng and Wang, Xiaofang and Juefei-Xu, Felix and Zhang, Ning and Yeung-Levy, Serena and Xia, Xide},
+    journal={arXiv preprint arXiv:2412.10360},
+    year={2024}
+}
+```
+For more details, visit the [project website](https://apollo-lmms.github.io) or check out the [paper](https://arxiv.org/abs/2412.10360).

config.json ADDED Viewed

	@@ -0,0 +1,293 @@

+{
+  "architectures": [
+    "ApolloForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attn_implementation": "flash_attention_2",
+  "clip_duration": 2,
+  "drop_path_rate": 0.0,
+  "encode_batch_size": 15,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "image_aspect_ratio": "square",
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "interpolate_mode": "linear",
+  "llm_cfg": {
+    "add_cross_attention": false,
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151645,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 28,
+    "min_length": 0,
+    "model_max_length": 16384,
+    "model_type": "qwen2",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 28,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_theta": 1000000.0,
+    "sep_token_id": null,
+    "sliding_window": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "tokenizer_model_max_length": 16384,
+    "tokenizer_padding_side": "right",
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 152064
+  },
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "mm_connector_cfg": {
+    "add_cross_attention": false,
+    "architectures": [
+      "Connector"
+    ],
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "ff_multi": 4,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "silu",
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "mm_connector",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_key_value_heads": 4,
+    "num_output_tokens": 128,
+    "num_patches": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "projector_type": "mlp1x_gelu",
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "resampler_depth": 1,
+    "resampler_head_dim": 96,
+    "resampler_n_heads": 16,
+    "resampler_type": "perciver",
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "text_hidden_size": 3584,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "token_input_shape": [
+      4,
+      27,
+      27
+    ],
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vision_hidden_size": 2560
+  },
+  "mm_connector_lr": 0.0001,
+  "mm_hidden_size": null,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -2,
+  "model_dtype": "torch.bfloat16",
+  "model_type": "apollo",
+  "num_attention_heads": 28,
+  "num_encode_batch": 0,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "num_video_frames": null,
+  "resume_path": "./work_dirs/final_run/apollo-Qwen2.5-7B-Instruct-internvideo2-siglip-so400m-patch14-384-freeze-perciver_128_2-newprompt-ft",
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "s2": false,
+  "s2_max_split_size": 336,
+  "s2_scales": "336,672,1008",
+  "sliding_window": null,
+  "temporal_prompt": true,
+  "timestamp_prompt": true,
+  "transformers_version": "4.44.0",
+  "tune_language_model": true,
+  "tune_mm_connector": true,
+  "tune_vision_tower": false,
+  "use_cache": true,
+  "use_mm_patch_token": false,
+  "use_mm_start_end": false,
+  "use_sliding_window": false,
+  "vision_resolution": -1,
+  "vision_tower_cfg": {
+    "add_cross_attention": false,
+    "architectures": null,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "configs": {},
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "hybrid_vision_tower",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "num_vision_encoders": 2,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "token_output_shape": [
+      4,
+      27,
+      27
+    ],
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vision_towers": [
+      "siglip-so400m-patch14-384",
+      "internvideo2"
+    ]
+  },
+  "vocab_size": 152064,
+  "auto_map": {
+    "AutoConfig": "configuration_apollo.ApolloConfig",
+    "AutoModelForCausalLM": "modeling_apollo.ApolloForCausalLM"
+  },
+  "model_max_length": 16384
+}

configuration.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"framework": "pytorch", "task": "video-text-to-text", "allow_remote": true}

configuration_apollo.py ADDED Viewed

	@@ -0,0 +1,47 @@

+#from transformers import PretrainedConfig
+from transformers import PretrainedConfig
+class ApolloConfig(PretrainedConfig):
+    model_type = "apollo"
+    def __init__(
+        self,
+        llm_cfg=None,
+        vision_tower_cfg=None,
+        mm_connector_cfg=None,
+        architectures=None,
+        resume_path=None,
+        image_aspect_ratio=None,
+        num_video_frames=None,
+        mm_vision_select_layer=None,
+        mm_vision_select_feature=None,
+        use_mm_start_end=False,
+        use_mm_patch_token=True,
+        mm_connector_lr=None,
+        vision_resolution=None,
+        interpolate_mode=None,
+        clip_duration=None,
+        vocab_size=None,
+        auto_map=None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.architectures = architectures
+        self.llm_cfg = llm_cfg
+        self.vision_tower_cfg = vision_tower_cfg
+        self.mm_connector_cfg = mm_connector_cfg
+        self.resume_path = resume_path
+        self.image_aspect_ratio = image_aspect_ratio
+        self.num_video_frames = num_video_frames
+        self.mm_vision_select_layer = mm_vision_select_layer
+        self.mm_vision_select_feature = mm_vision_select_feature
+        self.use_mm_start_end = use_mm_start_end
+        self.use_mm_patch_token = use_mm_patch_token
+        self.mm_connector_lr = mm_connector_lr
+        self.vision_resolution = vision_resolution
+        self.interpolate_mode = interpolate_mode
+        self.clip_duration = clip_duration
+        self.vocab_size=vocab_size
+        self.auto_map=auto_map

llm/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

llm/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "./work_dirs/final_run/apollo-Qwen2.5-7B-Instruct-internvideo2-siglip-so400m-patch14-384-freeze-perciver_128_2-newprompt-ft/llm",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_max_length": 16384,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 16384,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}

llm/generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.44.0"
+}

llm/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

llm/model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad28602d062e7ce6f51c4343652cef63168989c08ad1a47c11e64033c6c441ef
+size 4877660776

llm/model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f392fe912c9e60fa81d1ceff9994a769f4a08f6bb63b6d92ce6ef26fbdb1704
+size 4932751008

llm/model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcd8091303478c06d62188b50f1a3af122ac7bc8d2396bfda7d7a4d4d56693ec
+size 4330865200

llm/model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:576065d92cfe1cfc13576cd1327672d757ef36457f4fcba9e17f0ae90a4024b7
+size 1089994880

llm/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,346 @@

+{
+  "metadata": {
+    "total_size": 15231233024
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00003-of-00004.safetensors"
+  }
+}

llm/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

llm/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

llm/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "legacy": false,
+  "model_max_length": 16384,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

llm/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

mm_connector.py ADDED Viewed

	@@ -0,0 +1,306 @@

+import re, math, torch
+from collections import OrderedDict
+from typing import Optional, Tuple
+from torch import nn
+from torch.nn.init import trunc_normal_, normal_
+import torch.utils.checkpoint
+from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
+class ClassInstantier(OrderedDict):
+    """Ordered mapping that instantiates its stored class on every lookup.
+    Each value is either a class or a ``(class, kwargs)`` tuple; indexing the
+    mapping returns ``class(**kwargs)``, so every access yields a new object.
+    """
+    def __getitem__(self, key):
+        entry = super().__getitem__(key)
+        if isinstance(entry, tuple):
+            factory, factory_kwargs = entry
+        else:
+            # A bare class is constructed without keyword arguments.
+            factory, factory_kwargs = entry, {}
+        return factory(**factory_kwargs)
+# Activation name -> activation module class; only "silu" is registered here.
+ACT2CLS = {"silu": nn.SiLU}
+# Indexing ACT2FN by name constructs a new activation module on each lookup.
+ACT2FN = ClassInstantier(ACT2CLS)
+class WeightedNorm(nn.Module):
+    """LayerNorm followed by an extra learnable element-wise scale."""
+    def __init__(self, hidden_size):
+        """
+        Build the LayerNorm and the extra scale vector.
+        Args:
+            hidden_size: size of the last (normalized) dimension.
+        """
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.norm = nn.LayerNorm(hidden_size)
+        # Scale starts at ones and is then re-drawn around 1 with a small spread.
+        scale = nn.Parameter(torch.ones(hidden_size))
+        normal_(scale, mean=1, std=.02)
+        # NOTE: the attribute is spelled `wheight`; the attribute name is the
+        # parameter's registered name, so it is intentionally left unchanged.
+        self.wheight = scale
+    def forward(self, x):
+        """Apply LayerNorm to `x`, then multiply by the learned scale."""
+        normalized = self.norm(x)
+        return normalized * self.wheight
+class PerceiverMLP(nn.Module):
+    """Gated feed-forward block computing ``down_proj(act(gate_proj(x)) * up_proj(x))``."""
+    def __init__(
+            self,
+            hidden_size: int,
+            intermediate_size: int,
+            output_size: int,
+            hidden_act: str,
+    ):
+        """
+        Args:
+            hidden_size: input feature size of the gate and up projections.
+            intermediate_size: output size of the gate and up projections.
+            output_size: output feature size of the down projection.
+            hidden_act: activation name looked up in `ACT2FN`.
+        """
+        super().__init__()
+        # All three projections are bias-free linear layers.
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, output_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+    def forward(self, x):
+        """Gate the up projection with the activated gate projection, then project down."""
+        gate = self.act_fn(self.gate_proj(x))
+        gated = gate * self.up_proj(x)
+        return self.down_proj(gated)
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat each key/value head `n_rep` times along dim 1, matching
+    torch.repeat_interleave(x, dim=1, repeats=n_rep). The input of shape (batch,
+    num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    When `n_rep` is 1 the input tensor itself is returned.
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis after the head axis, broadcast it, then fold it into the head axis.
+    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+class PerceiverAttention(nn.Module):
+    def __init__(self, connector_config, layer_idx: Optional[int] = None) -> None:
+        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`
+        Args:
+            connector_config: object providing `text_hidden_size`, `resampler_n_heads`, `resampler_head_dim`
+                and `num_key_value_heads`.
+            layer_idx (`int`, *optional*): index handed to `past_key_value.update(...)` in `forward`.
+        """
+        super().__init__()
+        # Keep the caller-supplied index (it used to be discarded and forced to None).
+        self.layer_idx = layer_idx
+        self.hidden_size = connector_config.text_hidden_size
+        self.num_heads = connector_config.resampler_n_heads
+        self.head_dim = connector_config.resampler_head_dim
+        self.num_key_value_heads = connector_config.num_key_value_heads
+        # Number of query heads that share one key/value head.
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.is_causal = False
+    def forward(
+            self,
+            latents: torch.Tensor,
+            context: torch.Tensor,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: bool = False,
+            use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Runs Perceiver cross-attention: queries are projected from `latents` only, while keys and values are
+        projected from `context` and `latents` concatenated along the `seq` dimension.
+        Args:
+            latents (`torch.Tensor`): Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
+            context (`torch.Tensor`): Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
+            past_key_value (*optional*): cache object; when one is in effect its
+                `update(key_states, value_states, layer_idx)` method is called. An attribute named
+                `past_key_value` on this module, if present, takes precedence over the argument.
+            output_attentions (`bool`, *optional*, defaults to `False`): Whether to return attention weights.
+            use_cache (`bool`, *optional*, defaults to `False`): Accepted for interface compatibility; not read here.
+        Returns:
+            `(attn_output, attn_weights, past_key_value)`, where `attn_output` has shape
+            [bsz, n_latents, embed_dim] and `attn_weights` is `None` unless `output_attentions` is set.
+        Raises:
+            ValueError: if the attention weights or the attention output do not have the expected shape.
+        """
+        bsz, q_len, _ = latents.size()
+        kv_seq_len = q_len + context.size()[1]
+        # Keys/values see the context followed by the latents themselves.
+        hidden_states = torch.concat([context, latents], dim=-2)
+        query_states = self.q_proj(latents)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # [bsz, seq, heads * head_dim] -> [bsz, heads, seq, head_dim]
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        past_key_value = getattr(self, "past_key_value", past_key_value)
+        if past_key_value is not None:
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        # Scaled dot-product scores; no attention mask is applied.
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        # [bsz, heads, q_len, head_dim] -> [bsz, q_len, heads * head_dim]
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+# Registry mapping an attention-implementation name to its module class.
+# Only the "eager" implementation is registered.
+PERCEIVER_ATTENTION_CLASSES = {
+    "eager": PerceiverAttention,
+}
+class PerceiverLayer(nn.Module):
+    """One resampler block: normalized latents attend over the normalized context, followed by a gated MLP.
+    Both sub-blocks are wrapped in residual connections on the latents.
+    """
+    def __init__(self, connector_config, layer_idx: int):
+        """
+        Args:
+            connector_config: object providing `text_hidden_size`, `num_output_tokens`, `resampler_depth`,
+                `ff_multi`, `hidden_act` and `_attn_implementation` (used to pick the attention class).
+            layer_idx (`int`): index of this layer, passed on to the attention module.
+        """
+        super().__init__()
+        self.hidden_size = connector_config.text_hidden_size
+        self.n_latents = connector_config.num_output_tokens
+        self.depth = connector_config.resampler_depth
+        self.ff_multi = connector_config.ff_multi
+        # Latents and context are normalized by separate norms before attention.
+        self.input_latents_norm = WeightedNorm(self.hidden_size)
+        self.input_context_norm = WeightedNorm(self.hidden_size)
+        self.self_attn = PERCEIVER_ATTENTION_CLASSES[connector_config._attn_implementation](connector_config,
+                                                                                            layer_idx=layer_idx)
+        self.post_attention_layernorm = WeightedNorm(self.hidden_size)
+        self.mlp = PerceiverMLP(
+            hidden_size=connector_config.text_hidden_size,
+            intermediate_size=connector_config.text_hidden_size * self.ff_multi,
+            output_size=connector_config.text_hidden_size,
+            hidden_act=connector_config.hidden_act,
+        )
+    def forward(
+            self,
+            latents: torch.Tensor,
+            context: torch.Tensor,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: Optional[bool] = False,
+            use_cache: Optional[bool] = False,
+            **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attention weights of this layer; forwarded to the attention module.
+            use_cache (`bool`, *optional*):
+                If set to `True`, the third value returned by the attention module is appended to the outputs.
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
+                accepted for interface compatibility; it is not forwarded to the attention module.
+        Returns:
+            A tuple `(latents,)`, extended with the attention weights when `output_attentions` is set and with
+            the attention module's cache value when `use_cache` is set.
+        """
+        residual = latents
+        latents = self.input_latents_norm(latents)
+        context = self.input_context_norm(context)
+        # Forward `output_attentions` so the weights appended below are real tensors rather than None.
+        latents, self_attn_weights, present_key_value = self.self_attn(
+            latents=latents,
+            context=context,
+            output_attentions=output_attentions,
+        )
+        latents = residual + latents
+        residual = latents
+        latents = self.post_attention_layernorm(latents)
+        latents = self.mlp(latents)
+        latents = residual + latents
+        outputs = (latents,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+class PerceiverResampler(nn.Module):
+    """Perceiver Resampler that compresses input embeddings into a fixed number of latents.
+    A learned `(num_output_tokens, text_hidden_size)` latent matrix is broadcast over the batch,
+    passed through `resampler_depth` `PerceiverLayer` blocks together with the context, and
+    finally normalised.
+    """
+    def __init__(self, connector_config) -> None:
+        super().__init__()
+        cfg = connector_config
+        self.hidden_size = cfg.text_hidden_size
+        self.hidden_act = cfg.hidden_act
+        self.n_latents = cfg.num_output_tokens
+        self.depth = cfg.resampler_depth
+        # Learned latents, zero-initialised here (real values are expected to come from a checkpoint).
+        self.latents = nn.Parameter(torch.zeros(self.n_latents, self.hidden_size))
+        # One PerceiverLayer per depth step; the index is passed through as layer_idx.
+        self.layers = nn.ModuleList([PerceiverLayer(cfg, layer_idx) for layer_idx in range(self.depth)])
+        self.norm = WeightedNorm(self.hidden_size)
+        self._use_flash_attention_2 = cfg._attn_implementation == "flash_attention_2"
+    def forward(
+            self,
+            context: torch.Tensor,
+            attention_mask: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """Return the normalised latents after all layers; `attention_mask` is accepted but not used."""
+        batch_size = context.shape[0]
+        # (n_latents, hidden) -> (batch, n_latents, hidden) without copying the parameter.
+        hidden = self.latents.unsqueeze(0).expand((batch_size, *self.latents.size()))
+        for block in self.layers:
+            hidden = block(
+                hidden,
+                context,
+                past_key_value=None,
+                output_attentions=False,
+                use_cache=False,
+            )[0]
+        return self.norm(hidden)
+def build_mm_projector(
+    input_dim,
+    output_dim,
+    projector_type,
+    hidden_act='silu',
+    delay_load=False,
+    token_input_shape=0,
+    **kwargs
+    ) -> nn.Sequential:
+    """Build the projection stack that maps `input_dim` features to `output_dim`.
+    The stack always starts with `nn.Linear(input_dim, output_dim)`. When `projector_type`
+    ends in `mlp<N>x_gelu`, `N - 1` further `GELU -> Linear(output_dim, output_dim)` pairs
+    are appended; any other value yields the single linear layer.
+    `hidden_act`, `delay_load`, `token_input_shape` and `**kwargs` are accepted but not used.
+    """
+    depth_match = re.match(r'.*mlp(\d+)x_gelu$', projector_type)
+    # Number of GELU+Linear pairs that follow the first linear layer.
+    extra_stages = int(depth_match.group(1)) - 1 if depth_match is not None else 0
+    layers = [nn.Linear(input_dim, output_dim)]
+    for _ in range(extra_stages):
+        layers.extend((nn.GELU(), nn.Linear(output_dim, output_dim)))
+    return nn.Sequential(*layers)
+class MMConnector(PreTrainedModel):
+    """Connector module: a projection stack followed by a `PerceiverResampler`."""
+    config_class = PretrainedConfig
+    def __init__(self, config: PretrainedConfig) -> None:
+        super().__init__(config)
+        # Projection from the vision width to the text width.
+        self.proj = build_mm_projector(
+            config.vision_hidden_size,
+            config.text_hidden_size,
+            config.projector_type,
+            token_input_shape=config.token_input_shape,
+        )
+        # Resampler that consumes the projected features.
+        self.resampler = PerceiverResampler(config)
+    def forward(self, x):
+        """Project `x`, then resample it; returns the resampler output."""
+        projected = self.proj(x)
+        return self.resampler(projected)

mm_connector/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "architectures": [
+    "Connector"
+  ],
+  "attention_dropout": 0.0,
+  "ff_multi": 4,
+  "hidden_act": "silu",
+  "model_type": "mm_connector",
+  "num_key_value_heads": 4,
+  "num_output_tokens": 128,
+  "num_patches": 24,
+  "projector_type": "mlp1x_gelu",
+  "resampler_depth": 1,
+  "resampler_head_dim": 96,
+  "resampler_n_heads": 16,
+  "resampler_type": "perciver",
+  "rms_norm_eps": 1e-06,
+  "text_hidden_size": 3584,
+  "token_input_shape": [
+    4,
+    27,
+    27
+  ],
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0",
+  "vision_hidden_size": 2560,
+  "auto_map": {
+    "AutoConfig": "configuration_connector.ConnectorConfig"
+  }
+}

mm_connector/configuration_connector.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import torch
+import torch.nn as nn
+from typing import Dict, List, Union
+from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
+import torch.nn.functional as F
+import json, os
+class ConnectorConfig(PretrainedConfig):
+    """Configuration for the multimodal connector (`model_type = "mm_connector"`).
+    Args:
+        vision_hidden_size: width of the incoming vision features. Defaults to an empty list.
+        text_hidden_size: width of the language-model embeddings.
+        num_patches: stored as-is on the config.
+        rms_norm_eps: stored as-is on the config.
+        token_input_shape: stored as-is on the config. Defaults to an empty list.
+        **kwargs: forwarded to `PretrainedConfig.__init__`.
+    """
+    model_type = "mm_connector"
+    def __init__(
+        self,
+        vision_hidden_size: Union[int, List[int], None] = None,
+        text_hidden_size: int = 0,
+        num_patches: int = 24,
+        rms_norm_eps: float = 1e-4,
+        token_input_shape: Union[List[int], None] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # A fresh list per instance: a literal `[]` default would be shared by every config object.
+        self.vision_hidden_size = [] if vision_hidden_size is None else vision_hidden_size
+        self.text_hidden_size = text_hidden_size
+        self.num_patches = num_patches
+        self.rms_norm_eps = rms_norm_eps
+        self.token_input_shape = [] if token_input_shape is None else token_input_shape
+    @classmethod
+    def load_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "ConnectorConfig":
+        """Build a config from the JSON file at `pretrained_model_name_or_path`."""
+        # NOTE(review): `_set_token_in_kwargs` is a private PretrainedConfig helper - confirm it
+        # exists in the transformers version this file is pinned to.
+        cls._set_token_in_kwargs(kwargs)
+        config_dict, kwargs = cls.get_config_from_json(pretrained_model_name_or_path, **kwargs)
+        return cls.from_dict(config_dict, **kwargs)
+    @classmethod
+    def get_config_from_json(cls, config_file, **kwargs):
+        """Read `config_file` as JSON and return `(parsed_dict, kwargs)`."""
+        # Explicit encoding so parsing does not depend on the platform locale.
+        with open(config_file, 'r', encoding='utf-8') as file:
+            config_data = json.load(file)
+        return config_data, kwargs

mm_connector/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2973ab0aaf61364d182eca589bdc80e28a815eb94112a83cb28d42d24da6156e
+size 355169704

modeling_apollo.py ADDED Viewed

	@@ -0,0 +1,492 @@

+from typing import List, Optional, Tuple, Union
+import warnings, os, torch
+import torch.nn as nn
+from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, AutoModelForCausalLM, AutoTokenizer
+from transformers.modeling_utils import ContextManagers, no_init_weights
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+from .configuration_apollo import ApolloConfig
+from .vision_tower import ApolloVisionTower
+from .mm_connector import MMConnector
+# Label value written at positions that carry no training target in this file
+# (padding and the rows inserted for vision embeddings).
+IGNORE_INDEX = -100
+# Placeholder id inside `input_ids` marking where a vision embedding row is spliced in.
+X_TOKEN_INDEX = -200
+def get_model_config(config):
+    """Load the sub-configs referenced by `config` via `AutoConfig.from_pretrained`.
+    The keys `llm_cfg`, `vision_tower_cfg` and `mm_connector_cfg` are examined in that order.
+    A dict or `PretrainedConfig` value resolves to `<root>/<key without "_cfg">`, where the root
+    is `config._name_or_path` (when it has at least two characters) or `config.resume_path`.
+    A string value is used as the path directly. Keys with any other value (including missing
+    keys) are skipped, so the returned list can be shorter than three entries.
+    Returns:
+        List of loaded configs, in key order.
+    Raises:
+        ValueError: if the root path cannot be joined with the component name.
+    """
+    default_keys = ["llm_cfg", "vision_tower_cfg", "mm_connector_cfg"]
+    if hasattr(config, "_name_or_path") and len(config._name_or_path) >= 2:
+        root_path = config._name_or_path
+    else:
+        root_path = config.resume_path
+    return_pths = []
+    for key in default_keys:
+        cfg = getattr(config, key, None)
+        if isinstance(cfg, (dict, PretrainedConfig)):
+            try:
+                # key[:-4] drops the trailing "_cfg" to get the sub-directory name.
+                return_pths.append(os.path.join(root_path, key[:-4]))
+            except TypeError as err:
+                # os.path.join raises TypeError for a non-path root (e.g. None); keep the cause.
+                raise ValueError(f"Cannot find resume path in config for {key}!") from err
+        elif isinstance(cfg, str):
+            return_pths.append(cfg)
+    return [AutoConfig.from_pretrained(pth, trust_remote_code=True) for pth in return_pths]
+def build_llm_and_tokenizer(
+        llm_cfg: PretrainedConfig,
+        config: PretrainedConfig,
+        attn_implementation=None,
+        model_max_length=None,
+        *args,
+        **kwargs,
+) -> Tuple[PreTrainedModel, object]:
+    """Load the language model and its tokenizer from `llm_cfg._name_or_path`.
+    Args:
+        llm_cfg: config of the language model; its `_name_or_path` is the checkpoint location.
+        config: top-level config; `config.model_dtype` names the torch dtype to load with.
+        attn_implementation: forwarded to the model loader when not None.
+        model_max_length: tokenizer max length; falls back to `llm_cfg.model_max_length` when None.
+        *args: extra positional arguments for `AutoModelForCausalLM.from_pretrained`.
+        **kwargs: extra keyword arguments passed to both the model and the tokenizer loader.
+    Returns:
+        `(llm, tokenizer)`.
+    """
+    llm_path = llm_cfg._name_or_path
+    # Copy so the model-only option does not leak into the tokenizer call below.
+    llm_kwargs = dict(kwargs)
+    if attn_implementation is not None:
+        llm_kwargs["attn_implementation"] = attn_implementation
+    # NOTE(review): `eval` executes whatever string is stored in `config.model_dtype`
+    # (expected form: "torch.float16"). Only load configs from trusted sources.
+    llm = AutoModelForCausalLM.from_pretrained(
+        llm_path, *args, config=llm_cfg, torch_dtype=eval(config.model_dtype), **llm_kwargs
+    )
+    if model_max_length is None:
+        model_max_length = llm_cfg.model_max_length
+    tokenizer = AutoTokenizer.from_pretrained(
+        llm_path,
+        model_max_length=model_max_length,
+        padding_side="right",
+        use_fast=False,
+        legacy=False,
+        **kwargs
+    )
+    return llm, tokenizer
+class ApolloForCausalLM(PreTrainedModel):
+    """Causal language model that consumes vision features next to text tokens.
+    The instance owns a vision tower, a multimodal connector, a language model and its
+    tokenizer. Positions in `input_ids` equal to `X_TOKEN_INDEX` are replaced by rows of
+    the encoded vision features before the language model is called.
+    """
+    def __init__(self, config: ApolloConfig, *args, **kwargs):
+        """Build all sub-modules from the sub-configs resolved by `get_model_config`."""
+        super().__init__(config)
+        llm_cfg, vision_tower_cfg, mm_connector_cfg = get_model_config(config)
+        model_dtype = getattr(config, "model_dtype", "torch.float16")
+        if not hasattr(config, "model_dtype"):
+            warnings.warn("model_dtype not found in config, defaulting to torch.float16.")
+            config.model_dtype = model_dtype
+        # Initialize weights and apply final processing
+        self.lm_head = nn.Linear(llm_cfg.hidden_size, config.vocab_size, bias=False)
+        self.vision_tower = ApolloVisionTower(config, vision_tower_cfg)
+        self.mm_connector = MMConnector.from_pretrained(mm_connector_cfg._name_or_path)
+        self.llm, self.tokenizer = build_llm_and_tokenizer(llm_cfg, config, *args, **kwargs)
+        self.post_init()
+        self.is_loaded = True
+    def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            vision_input: Optional[List[torch.FloatTensor]] = None,
+            data_types: Optional[List[str]] = None,
+            return_dict: Optional[bool] = None,
+            cache_position=None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Merge vision features into the inputs (unless `inputs_embeds` is given) and run the LLM.
+        `cache_position` is accepted but not forwarded to the language model.
+        """
+        if inputs_embeds is None:
+            (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                labels,
+                vision_input,
+                data_types
+            )
+        return self.get_llm().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+    @torch.no_grad()
+    def generate(
+            self,
+            inputs: Optional[torch.Tensor] = None,
+            vision_input: Optional[List[torch.Tensor]] = None,
+            data_types: Optional[List[str]] = None,
+            **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        """Generate with the wrapped LLM from embeddings built out of `inputs` and `vision_input`.
+        Raises:
+            NotImplementedError: if `inputs_embeds` is passed in `kwargs`.
+        """
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+        if vision_input is not None:
+            (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(
+                inputs, position_ids, attention_mask, None, None, vision_input, data_types=data_types)
+        else:
+            # Text-only path: this class has no `embed_tokens` attribute of its own, so embed
+            # through the wrapped LLM exactly as prepare_inputs_labels_for_multimodal does.
+            inputs_embeds = self.get_llm().model.embed_tokens(inputs)
+        return self.get_llm().generate(position_ids=position_ids, attention_mask=attention_mask,
+                                       inputs_embeds=inputs_embeds, **kwargs)
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        """Delegate to the LLM's `prepare_inputs_for_generation`, then re-attach the vision arguments."""
+        vision_input = kwargs.pop("vision_input", None)
+        data_types = kwargs.pop("data_types", None)
+        inputs = self.get_llm().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values,
+                                                              inputs_embeds=inputs_embeds, **kwargs)
+        if vision_input is not None:
+            inputs["vision_input"] = vision_input
+        if data_types is not None:
+            inputs["data_types"] = data_types
+        return inputs
+    @classmethod
+    def from_pretrained(
+            cls,
+            pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+            *model_args,
+            config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+            cache_dir: Optional[Union[str, os.PathLike]] = None,
+            ignore_mismatched_sizes: bool = False,
+            force_download: bool = False,
+            local_files_only: bool = False,
+            token: Optional[Union[str, bool]] = None,
+            revision: str = "main",
+            use_safetensors: bool = None,
+            **kwargs,
+    ):
+        """Forward every argument unchanged to `load_pretrained`."""
+        return cls.load_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            use_safetensors=use_safetensors,
+            **kwargs,
+        )
+    def get_llm(self):
+        """Return the wrapped language model."""
+        return self.llm
+    def get_vision_tower(self):
+        """Return the vision tower."""
+        return self.vision_tower
+    def get_mm_connector(self):
+        """Return the multimodal connector."""
+        return self.mm_connector
+    @classmethod
+    def load_pretrained(cls, model_path_or_config, *args, **kwargs):
+        """Instantiate the model from a path (str) or an `ApolloConfig`.
+        Any `config` entry in `kwargs` is discarded. Returns the model when all sub-modules are
+        present and `is_loaded` is set; otherwise prints a message and returns None.
+        Raises:
+            NotImplementedError: if `model_path_or_config` is neither a str nor an `ApolloConfig`.
+        """
+        kwargs.pop("config", None)
+        if isinstance(model_path_or_config, str):
+            config = AutoConfig.from_pretrained(model_path_or_config, trust_remote_code=True, **kwargs)
+        elif isinstance(model_path_or_config, ApolloConfig):
+            config = model_path_or_config
+        else:
+            raise NotImplementedError(f"wrong type, {type(model_path_or_config)} \
+                                      {isinstance(model_path_or_config, ApolloConfig)}")
+        model_dtype = getattr(config, "model_dtype", "torch.float16")
+        if not hasattr(config, "model_dtype"):
+            warnings.warn("model_dtype not found in config, defaulting to torch.float16.")
+            config.model_dtype = model_dtype
+        with ContextManagers([no_init_weights(_enable=True), ]):
+            vlm = cls(config, *args, **kwargs)
+        if hasattr(vlm, "llm") and hasattr(vlm, "vision_tower") and hasattr(vlm, "mm_connector"):
+            if vlm.is_loaded:
+                return vlm
+            else:
+                print('loading model failed!')
+        else:
+            print('loading model failed!')
+    def _encode_mm(self, x):
+        """Run `x` through the vision tower and then the connector."""
+        x = self.get_vision_tower()(x)
+        x = self.mm_connector(x)
+        return x
+    def encode_mm_minibatch(self, x):
+        """Encode the vision inputs of a whole batch in chunks of `config.encode_batch_size`.
+        `x` holds one entry per sample, each entry holding one tensor per vision encoder.
+        Per encoder, the tensors of all samples are concatenated on dim 0 and split into chunks;
+        each chunk group is encoded, the results are concatenated and split back per sample
+        (using the dim-0 size of each sample's first tensor), and the first two dims of every
+        per-sample result are flattened.
+        """
+        split_sizes = [x_s[0].shape[0] for x_s in x]
+        x = [torch.split(torch.cat([x_s[i] for x_s in x], dim=0), self.config.encode_batch_size) for i in
+             range(self.get_vision_tower().num_vision_encoders)]
+        # Regroup from per-encoder lists of chunks to per-chunk lists of encoder inputs.
+        swapped_x = []
+        for i in range(len(x[0])):
+            swapped_x.append([x_s[i] for x_s in x])
+        features = []
+        for xx in swapped_x:
+            xx = self._encode_mm(xx)
+            features.append(xx)
+        x = torch.cat(features, dim=0)
+        x = torch.split(x, split_sizes, dim=0)
+        return [xx.contiguous().view(-1, xx.shape[2]) for xx in x]
+    def prepare_inputs_labels_for_multimodal(
+            self, input_ids, position_ids, attention_mask, past_key_values, labels, vision_input, data_types
+    ):
+        """Replace every `X_TOKEN_INDEX` placeholder by one vision-feature row and re-pad the batch.
+        Returns the 6-tuple `(input_ids, position_ids, attention_mask, past_key_values,
+        inputs_embeds, labels)`. When there is nothing to merge (no vision tower, no vision
+        input, or a single-token decoding step) the inputs are returned with `inputs_embeds=None`;
+        during cached decoding the attention mask is first extended to the cache length + 1.
+        Otherwise `input_ids` is returned as None and `inputs_embeds` carries the merged sequence.
+        `position_ids`, `attention_mask` and `labels` come back as None when they were passed as None.
+        """
+        vision_tower = self.get_vision_tower()
+        if vision_tower is None or vision_input is None or input_ids.shape[1] == 1:
+            if (
+                    past_key_values is not None
+                    and vision_tower is not None
+                    and vision_input is not None
+                    and input_ids.shape[1] == 1
+            ):
+                target_shape = past_key_values[-1][-1].shape[-2] + 1
+                attention_mask = torch.cat(
+                    (
+                        attention_mask,
+                        torch.ones(
+                            (
+                                attention_mask.shape[0],
+                                target_shape - attention_mask.shape[1],
+                            ),
+                            dtype=attention_mask.dtype,
+                            device=attention_mask.device,
+                        ),
+                    ),
+                    dim=1,
+                )
+                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+            return (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                None,
+                labels,
+            )
+        # vision_input is a list of tuples, and data_type is a list of strings:
+        #   data_type = ['image', 'video', 'video'..., 'text']
+        # (for one video and two image encoders)
+        #   vision_input =
+        #   [
+        #       [image(1, T, C, H, W), image(1, T, C, H, W), image(1, T, C, H, W)],
+        #       [video(Nc1, C, T, H, W), video(Nc1, T, C, H, W), video(Nc1, T, C, H, W)],
+        #       [video(Nc2, C, T, H, W), video(Nc2, T, C, H, W), video(Nc2, T, C, H, W)],
+        #   ]
+        # -> video encoders typically expect (C,T,H,W), images expect (C,H,W).
+        # ====================================================================================================
+        merged_mm_features = self.encode_mm_minibatch(vision_input)
+        if not getattr(self.config, "tune_language_model", True) and getattr(self.config, "use_mm_start_end", False):
+            raise NotImplementedError
+        # ====================================================================================================
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+        # remove the padding using attention_mask
+        input_ids_copy = input_ids.clone()
+        # kentang-mit@: Otherwise tokenizer out of bounds. Embeddings of image tokens will not be used.
+        input_ids_copy[input_ids_copy == X_TOKEN_INDEX] = 0
+        input_embeds = self.get_llm().model.embed_tokens(input_ids_copy)
+        input_ids = [
+            cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
+        ]
+        input_embeds_1 = [
+            cur_input_embeds[cur_attention_mask]
+            for cur_input_embeds, cur_attention_mask in zip(input_embeds, attention_mask)
+        ]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+        new_labels = []
+        new_input_embeds = []
+        # kentang-mit@: If some part of the model is executed in the loop, the loop length needs to be a constant.
+        for batch_idx, (cur_labels, cur_input_ids, mm_features) in enumerate(
+                zip(labels, input_ids, merged_mm_features)):
+            cur_input_ids = input_ids[batch_idx]
+            num_mm = (cur_input_ids == X_TOKEN_INDEX).sum()
+            if num_mm == 0:
+                cur_input_embeds_1 = input_embeds_1[batch_idx]
+                # mm_features[0:0] is empty; the concat leaves the text embeddings unchanged.
+                cur_input_embeds = torch.cat([cur_input_embeds_1, mm_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(cur_labels)
+                # kentang-mit@: we do not have placeholder image for text-only data now.
+                continue
+            if mm_features.shape[0] != num_mm:
+                print(data_types[batch_idx])
+                assert num_mm == len(
+                    mm_features), f'Error in {data_types[batch_idx]}{num_mm}=/={len(mm_features)} not the same number of vision tokens in and vision embeddings!'
+            cur_input_embeds = input_embeds_1[batch_idx]
+            # Sentinels at both ends so every text span between placeholders is a plain slice.
+            image_token_indices = (
+                    [-1] + torch.where(cur_input_ids == X_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            )
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            cur_input_embeds_no_im = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1: image_token_indices[i + 1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1: image_token_indices[i + 1]])
+                cur_input_embeds_no_im.append(cur_input_embeds[image_token_indices[i] + 1: image_token_indices[i + 1]])
+            cur_new_input_embeds = []
+            cur_new_labels = []
+            # Interleave: text span, one vision row, text span, ... , final text span.
+            for i in range(num_mm + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_mm:
+                    cur_image_features = mm_features[i:i + 1]
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(
+                        torch.full(
+                            (cur_image_features.shape[0],),
+                            IGNORE_INDEX,
+                            device=cur_labels.device,
+                            dtype=cur_labels.dtype,
+                        )
+                    )
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.get_llm().config, "tokenizer_model_max_length", None)
+        if tokenizer_model_max_length is not None:
+            if any(len(x) > tokenizer_model_max_length for x in new_input_embeds):
+                warnings.warn("Inputs truncated!")
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full(
+            (batch_size, max_len),
+            IGNORE_INDEX,
+            dtype=new_labels[0].dtype,
+            device=new_labels[0].device,
+        )
+        attention_mask = torch.zeros(
+            (batch_size, max_len),
+            dtype=attention_mask.dtype,
+            device=attention_mask.device,
+        )
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.get_llm().config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                            cur_new_embed,
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+            else:
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            cur_new_embed,
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+        # Hand back None for every tensor the caller originally passed as None.
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+        if _position_ids is None:
+            position_ids = None
+        return (
+            None,
+            position_ids,
+            attention_mask,
+            past_key_values,
+            new_input_embeds,
+            new_labels,
+        )

vision_tower.py ADDED Viewed

	@@ -0,0 +1,556 @@

+import torch, os, PIL, numbers
+from PIL import Image
+import cv2
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.siglip.modeling_siglip import SiglipVisionModel
+from transformers import AutoConfig, AutoModel, SiglipImageProcessor, SiglipVisionConfig, PretrainedConfig
+from typing import Union
+import torch.nn.functional as F
+import numpy as np
+def crop_clip(clip, min_h, min_w, h, w):
+    """Crop the same `h` x `w` window out of every frame of `clip`.
+    Args:
+        clip: list of frames, all `numpy.ndarray` (indexed as rows, columns, channels) or all
+            `PIL.Image.Image`; the type of the first frame selects the code path.
+        min_h, min_w: top row and left column of the window.
+        h, w: window height and width.
+    Returns:
+        List with one cropped frame per input frame.
+    Raises:
+        TypeError: if the first frame is neither an ndarray nor a PIL image.
+    """
+    if isinstance(clip[0], np.ndarray):
+        cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip]
+    elif isinstance(clip[0], PIL.Image.Image):
+        # PIL boxes are (left, upper, right, lower).
+        cropped = [
+            img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip
+        ]
+    else:
+        raise TypeError('Expected numpy.ndarray or PIL.Image ' +
+                        'but got list of {0}'.format(type(clip[0])))
+    return cropped
+class Normalize(object):
+    """Normalize a tensor clip with a per-channel mean and standard deviation.
+    The work is delegated to `normalize`, which indexes `mean` and `std` along the first
+    dimension of the clip, i.e. ``clip[c] = (clip[c] - mean[c]) / std[c]``.
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input tensor.
+    Args:
+        mean (sequence): Sequence of means, one per channel.
+        std (sequence): Sequence of standard deviations, one per channel.
+    """
+    def __init__(self, mean, std):
+        self.mean, self.std = mean, std
+    def __call__(self, clip):
+        """
+        Args:
+            clip (Tensor): 4-D tensor clip whose first dimension is the channel axis.
+        Returns:
+            Tensor: Normalized copy of the clip.
+        """
+        normalized = normalize(clip, self.mean, self.std)
+        return normalized
+    def __repr__(self):
+        return f'{type(self).__name__}(mean={self.mean}, std={self.std})'
+class CenterCrop(object):
+    """Extract a center crop at the same location for a list of images.
+    Args:
+    size (sequence or int): Desired output size for the
+    crop in format (h, w); a single number is used for both sides.
+    """
+    def __init__(self, size):
+        if isinstance(size, numbers.Number):
+            size = (size, size)
+        self.size = size
+    def __call__(self, clip):
+        """
+        Args:
+        clip (list of PIL.Image or numpy.ndarray): images to be cropped;
+        ndarrays are read as (h, w, c)
+        Returns:
+        list: cropped images, same type as the input frames
+        Raises:
+        TypeError: if the first frame is neither an ndarray nor a PIL image
+        ValueError: if the requested crop is larger than the frames
+        """
+        h, w = self.size
+        if isinstance(clip[0], np.ndarray):
+            im_h, im_w, im_c = clip[0].shape
+        elif isinstance(clip[0], PIL.Image.Image):
+            im_w, im_h = clip[0].size
+        else:
+            raise TypeError('Expected numpy.ndarray or PIL.Image ' +
+                            'but got list of {0}'.format(type(clip[0])))
+        if w > im_w or h > im_h:
+            error_msg = (
+                'Initial image size should be larger than '
+                'cropped size but got cropped sizes : ({w}, {h}) while '
+                'initial image is ({im_w}, {im_h})'.format(
+                    im_w=im_w, im_h=im_h, w=w, h=h))
+            raise ValueError(error_msg)
+        # Top-left corner that centers the window in the frame.
+        x1 = int(round((im_w - w) / 2.))
+        y1 = int(round((im_h - h) / 2.))
+        cropped = crop_clip(clip, y1, x1, h, w)
+        return cropped
+def resize_clip(clip, size, interpolation='bilinear'):
+    """Resize every frame of `clip`.
+    Args:
+        clip: list of frames, all `numpy.ndarray` (h, w, c) or all `PIL.Image.Image`.
+        size: a number (the shorter side is scaled to it, aspect ratio kept via
+            `get_resize_sizes`) or a 2-sequence giving the target size.
+        interpolation: 'bilinear' selects linear interpolation; anything else selects nearest.
+    Returns:
+        List of resized frames, or `clip` itself when `size` is a number and the shorter
+        side already equals it.
+    Raises:
+        TypeError: if the first frame is neither an ndarray nor a PIL image.
+    """
+    if isinstance(clip[0], np.ndarray):
+        if isinstance(size, numbers.Number):
+            im_h, im_w, im_c = clip[0].shape
+            # Min spatial dim already matches minimal size
+            if (im_w <= im_h and im_w == size) or (im_h <= im_w
+                                                   and im_h == size):
+                return clip
+            new_h, new_w = get_resize_sizes(im_h, im_w, size)
+            size = (new_w, new_h)
+        else:
+            # NOTE(review): a tuple is passed through in its given order here but swapped in
+            # the PIL branch below, so the two branches read a tuple `size` differently - confirm
+            # which order callers intend.
+            size = size[0], size[1]
+        if interpolation == 'bilinear':
+            np_inter = cv2.INTER_LINEAR
+        else:
+            np_inter = cv2.INTER_NEAREST
+        scaled = [
+            cv2.resize(img, size, interpolation=np_inter) for img in clip
+        ]
+    elif isinstance(clip[0], PIL.Image.Image):
+        if isinstance(size, numbers.Number):
+            im_w, im_h = clip[0].size
+            # Min spatial dim already matches minimal size
+            if (im_w <= im_h and im_w == size) or (im_h <= im_w
+                                                   and im_h == size):
+                return clip
+            new_h, new_w = get_resize_sizes(im_h, im_w, size)
+            size = (new_w, new_h)
+        else:
+            size = size[1], size[0]
+        if interpolation == 'bilinear':
+            pil_inter = PIL.Image.BILINEAR
+        else:
+            pil_inter = PIL.Image.NEAREST
+        scaled = [img.resize(size, pil_inter) for img in clip]
+    else:
+        raise TypeError('Expected numpy.ndarray or PIL.Image ' +
+                        'but got list of {0}'.format(type(clip[0])))
+    return scaled
+def _is_tensor_clip(clip):
+    """Return True only when `clip` is a torch tensor with exactly four dimensions."""
+    if not torch.is_tensor(clip):
+        return False
+    return clip.dim() == 4
+def get_resize_sizes(im_h, im_w, size):
+    if im_w < im_h:
+        ow = size
+        oh = int(size * im_h / im_w)
+    else:
+        oh = size
+        ow = int(size * im_w / im_h)
+    return oh, ow
+def normalize(clip, mean, std, inplace=False):
+    if not _is_tensor_clip(clip):
+        raise TypeError('tensor is not a torch clip.')
+    if not inplace:
+        clip = clip.clone()
+    dtype = clip.dtype
+    mean = torch.as_tensor(mean, dtype=dtype, device=clip.device)
+    std = torch.as_tensor(std, dtype=dtype, device=clip.device)
+    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
+    return clip
+class Resize(object):
+    """Resizes a list of (H x W x C) numpy.ndarray to the final size
+    The larger the original image is, the more times it takes to
+    interpolate
+    Args:
+    interpolation (str): Can be one of 'nearest', 'bilinear'
+    defaults to nearest
+    size (tuple): (widht, height)
+    """
+    def __init__(self, size, interpolation='nearest'):
+        self.size = size
+        self.interpolation = interpolation
+    def __call__(self, clip):
+        resized = resize_clip(
+            clip, self.size, interpolation=self.interpolation)
+        return resized
+class Compose(object):
+    """Composes several transforms
+    Args:
+    transforms (list of ``Transform`` objects): list of transforms
+    to compose
+    """
+    def __init__(self, transforms):
+        self.transforms = transforms
+    def __call__(self, clip):
+        for t in self.transforms:
+            clip = t(clip)
+        return clip
+def convert_img(img):
+    """Converts (H, W, C) numpy.ndarray to (C, W, H) format"""
+    if len(img.shape) == 3:
+        img = img.transpose(2, 0, 1)
+    if len(img.shape) == 2:
+        img = np.expand_dims(img, 0)
+    return img
+class ClipToTensor(object):
+    """Convert a list of m (H x W x C) numpy.ndarrays in the range [0, 255]
+    to a torch.FloatTensor of shape (C x m x H x W) in the range [0, 1.0]
+    """
+    def __init__(self, channel_nb=3, div_255=True, numpy=False):
+        self.channel_nb = channel_nb
+        self.div_255 = div_255
+        self.numpy = numpy
+    def __call__(self, clip):
+        """
+        Args: clip (list of numpy.ndarray): clip (list of images)
+        to be converted to tensor.
+        """
+        # Retrieve shape
+        if isinstance(clip[0], np.ndarray):
+            h, w, ch = clip[0].shape
+            assert ch == self.channel_nb, "Got {0} instead of 3 channels".format(ch)
+        elif isinstance(clip[0], Image.Image):
+            w, h = clip[0].size
+        else:
+            raise TypeError(
+                "Expected numpy.ndarray or PIL.Image\
+            but got list of {0}".format(
+                    type(clip[0])
+                )
+            )
+        np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])
+        # Convert
+        for img_idx, img in enumerate(clip):
+            if isinstance(img, np.ndarray):
+                pass
+            elif isinstance(img, Image.Image):
+                img = np.array(img, copy=False)
+            else:
+                raise TypeError(
+                    "Expected numpy.ndarray or PIL.Image\
+                but got list of {0}".format(
+                        type(clip[0])
+                    )
+                )
+            img = convert_img(img)
+            np_clip[:, img_idx, :, :] = img
+        if self.numpy:
+            if self.div_255:
+                np_clip = np_clip / 255.0
+            return np_clip
+        else:
+            tensor_clip = torch.from_numpy(np_clip)
+            if not isinstance(tensor_clip, torch.FloatTensor):
+                tensor_clip = tensor_clip.float()
+            if self.div_255:
+                tensor_clip = torch.div(tensor_clip, 255)
+            return tensor_clip
+class VisionTowerConfig(PretrainedConfig):
+    model_type = "vision_tower"
+    def __init__(self, vision_tower_name: str = None, **kwargs):
+        super().__init__()
+        self.vision_tower_name = vision_tower_name
+class ProcessorWrapper:
+    def __init__(self, transform=None, processor=None, height=378, width=378, frames_per_clip=1,
+                 image_mean=[0.48145466, 0.4578275, 0.40821073]):
+        assert transform is not None or processor is not None, "ERROR: you did not define both `transform` and `processor`! You must define either transform or processor"
+        assert transform is None or processor is None, "ERROR: you did defined both `transform` and `processor`! You must define only one of: transform or processor"
+        self._size = {
+            "height": height,
+            "width": width,
+            "frames_per_clip": frames_per_clip
+        }
+        self._transforms = transform
+        self._processor = processor
+        self.image_mean = image_mean
+    @property
+    def size(self):
+        return self._size
+    def preprocess(self, image, return_tensors='pt'):
+        # Ensure image is a PIL Image
+        output = {}
+        if self._transforms is not None:
+            output['pixel_values'] = [self._transforms(image)]
+        else:
+            output = self._processor(image, return_tensors='pt')
+        return output
+    def save_pretrained(self, save_path):
+        if self._transforms is not None:
+            transform_dict = transform_to_dict(self._transforms)
+            transform_dict["image_processor_type"] = "transforms"
+            with open(os.path.join(save_path, 'preprocessor_config.json'), 'w') as f:
+                json.dump(transform_dict, f, indent=4)
+        else:
+            self._processor.save_pretrained(save_path)
+        return
+class VisionTower(PreTrainedModel):
+    config_class = VisionTowerConfig
+    def __init__(self, model_name_or_path: str, config: PretrainedConfig, vision_config: VisionTowerConfig = None):
+        super().__init__(vision_config)
+        self.vision_tower_name = model_name_or_path
+        self.vision_config = vision_config
+        self.select_layer = getattr(config, "mm_vision_select_layer", -2)
+        self.select_feature = getattr(config, "mm_vision_select_feature", "patch")
+        self.encode_batch_size = getattr(config, "encode_batch_size", 0) // 2
+        self.num_encode_batch = getattr(config, "num_encode_batch", 0) // 2
+        self.temporal_tubelet_size = getattr(vision_config, "tubelet_size", 1)
+    def feature_select(self, image_features):
+        if self.select_layer is not None:
+            image_features = image_features.hidden_states[self.select_layer]
+        if self.select_feature == "patch":
+            image_features = image_features[:, 1:]
+        elif self.select_feature == "cls_patch":
+            image_features = image_features
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+        return image_features
+    def vision_tower_forward(self, image):
+        image_feature = self.vision_tower(image, output_hidden_states=True)
+        return image_feature
+    def _forward(self, images, out_T=1):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                image_feature = self.vision_tower_forward(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
+                image_feature = self.feature_select(image_feature).to(image.dtype)
+                image_feature = image_features.reshape(image_feature.shape[0], self.W, self.H, self.D)
+                image_features.append(image_feature)
+        else:
+            original_shape = images.shape
+            if len(original_shape) == 5 and self.T == 1:
+                # downsample temporally if needed, and reshape from (B, T, C, W, H) to (B*T, C, W, H).
+                images = images[:, ::original_shape[1] // out_T, ...]
+                original_shape = images.shape
+                images = images.view(-1, *original_shape[2:])
+            image_features = self.vision_tower_forward(images.to(device=self.device, dtype=self.dtype))
+            image_features = self.feature_select(image_features).to(images.dtype)
+            # Reshape back to (B, T, ...) if necessary
+            if len(original_shape) == 5 and self.T == 1:
+                # Assuming the feature dimension does not change, adapt the following line if it does
+                new_shape = list(image_features.shape[:-2]) + [self.W, self.H, self.hidden_size]
+                image_features = image_features.reshape(new_shape)
+                feature_size = image_features.shape[1:]
+                image_features = image_features.view(original_shape[0], original_shape[1], *feature_size)
+            else:
+                image_features = image_features.reshape(image_features.shape[0], self.T, self.W, self.H, self.hidden_size)
+        return image_features
+    def forward(self, images):
+        return self._forward(images)
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+    @property
+    def dtype(self):
+        return self.vision_tower.dtype
+    @property
+    def device(self):
+        return self.vision_tower.device
+    @property
+    def num_patches(self):
+        return (self.config.image_size // self.config.patch_size) ** 2
+class InternVideoTower(VisionTower):
+    def __init__(self, model_name_or_path: str, config: PretrainedConfig, vision_config: PretrainedConfig = None):
+        if vision_config is None:
+            vision_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
+        super().__init__(model_name_or_path, config, vision_config)
+        self.vision_config = vision_config
+        normalize = ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
+        print('loading: ', model_name_or_path)
+        model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
+        self.vision_tower = model.to(dtype=eval(config.model_dtype))
+        transform = Compose([
+            Resize(self.vision_config.img_size, interpolation='bilinear'),
+            CenterCrop(size=(self.vision_config.img_size, self.vision_config.img_size)),
+            ClipToTensor(),
+            Normalize(mean=normalize[0], std=normalize[1])
+        ])
+        self.vision_processor = ProcessorWrapper(transform=transform,
+                                                 height=self.vision_config.img_size,
+                                                 width=self.vision_config.img_size,
+                                                 frames_per_clip=self.vision_config.num_frames,
+                                                 image_mean=normalize[0])
+        self.W = self.H = vision_config.img_size // vision_config.patch_size
+        self.T = self.vision_config.num_frames // self.vision_config.tubelet_size
+        self.num_frames = self.vision_config.num_frames
+        self.hidden_size = vision_config.d_model
+        self.vision_select_layer=self.select_layer
+        self.select_layer=None
+    def vision_tower_forward(self, video):
+        if video.shape[-3] < self.num_frames:
+            video = video.repeat_interleave(self.num_frames, dim=-3)
+        elif video.shape[-3] > self.num_frames:
+            video = video[:, :, ::video.shape[-3] // self.num_frames, ...]
+        video_feature = self.vision_tower(video.to(device=self.device, dtype=self.dtype),
+                                          x_vis_return_idx=self.vision_select_layer, x_vis_only=True)
+        return video_feature
+    @property
+    def device(self):
+        return self.vision_tower.pos_embed.device
+class SiglipVisionTower(VisionTower):
+    def __init__(self, model_name_or_path: str, config: PretrainedConfig, vision_config: PretrainedConfig = None):
+        if vision_config is None:
+            vision_config = SiglipVisionConfig.from_pretrained(model_name_or_path)
+        super().__init__(model_name_or_path, config, vision_config)
+        self.vision_config = vision_config
+        self.vision_tower_name = model_name_or_path
+        self.vision_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
+        print('loading: ', model_name_or_path)
+        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
+        self.hidden_size = self.vision_config.hidden_size
+        self.W = self.H = self.vision_config.image_size // self.vision_config.patch_size
+        self.T = 1
+        self.select_feature = "cls_patch"
+class ApolloVisionTower(PreTrainedModel):
+    def __init__(self, config, vision_tower_cfg):
+        super(ApolloVisionTower, self).__init__(config, vision_tower_cfg)
+        self.model_name_or_path = vision_tower_cfg._name_or_path
+        self.vision_towers = vision_tower_cfg.vision_towers
+        self._config = vision_tower_cfg
+        for vision_tower_name in self.vision_towers:
+            if 'internvideo' in vision_tower_name.lower():
+                vision_tower = InternVideoTower(os.path.join(vision_tower_cfg._name_or_path, vision_tower_name), config)
+            elif 'siglip' in vision_tower_name.lower():
+                vision_tower = SiglipVisionTower(os.path.join(vision_tower_cfg._name_or_path, vision_tower_name),
+                                                 config)
+            setattr(self, vision_tower_name, vision_tower)
+        self.vision_processor = [getattr(self, vt).vision_processor for vt in self.vision_towers]
+        self.num_vision_encoders = len(self.vision_towers)
+        self.W = self.H = max([getattr(self, vt).W for vt in self.vision_towers])
+        self.T = max([getattr(self, vt).T for vt in self.vision_towers])
+        self.max_tubelet_size = max(
+            [getattr(getattr(self, vt).vision_config, 'tubelet_size', 1) for vt in self.vision_towers])
+        self._hidden_size = sum([getattr(self, vt).hidden_size for vt in self.vision_towers])
+        self.token_output_shape = (self.T, self.W, self.H)
+        self.config.num_vision_encoders = self.num_vision_encoders
+        self.config.vision_towers = self.vision_towers
+        self.config.token_output_shape = self.token_output_shape
+    def forward(self, x):
+        output_features = []
+        for x_s, vision_tower_name in zip(x, self.vision_towers):
+            vision_tower = getattr(self, vision_tower_name)
+            features = vision_tower._forward(x_s, out_T=self.T)
+            if len(features.shape) != len(self.token_output_shape) + 2:
+                features = features.unsqueeze(1)
+            if features.shape[-len(self.token_output_shape) - 1:-1] != self.token_output_shape:
+                features = features.permute(0, 4, 1, 2, 3).contiguous()  # shape [B, D, T, W, H]
+                features = F.interpolate(features.to(torch.float32), size=self.token_output_shape, mode='trilinear',
+                                         align_corners=False).to(features.dtype)
+                features = features.permute(0, 2, 3, 4, 1).contiguous()
+            output_features.append(features)
+        output_features = torch.cat(output_features, dim=-1)
+        output_features = torch.flatten(output_features, start_dim=1, end_dim=-2)
+        return output_features
+    def save_pretrained(
+            self,
+            save_directory: Union[str, os.PathLike],
+            state_dict=None,
+            **kwargs,
+    ):
+        if state_dict is None:
+            state_dict = self.state_dict()
+        for vision_tower_name in self.vision_towers:
+            vision_tower = getattr(self, vision_tower_name)
+            vision_tower_state_dict = OrderedDict(
+                {k.split(f"vision_tower.{vision_tower_name}.vision_tower.")[-1]: v for k, v in state_dict.items() if
+                 vision_tower_name in k}
+            )
+            vision_tower.vision_tower.save_pretrained(os.path.join(save_directory, vision_tower_name),
+                                                      state_dict=vision_tower_state_dict, **kwargs)
+            vision_tower.vision_processor.save_pretrained(os.path.join(save_directory, vision_tower_name))
+        config = self.config
+        config.configs = {}
+        config.save_pretrained(save_directory)
+    @property
+    def patch_size(self):
+        return self._patch_size
+    @property
+    def image_size(self):
+        return self._image_size
+    @property
+    def hidden_size(self):
+        return self._hidden_size

vision_tower/config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "configs": {},
+  "model_type": "hybrid_vision_tower",
+  "num_vision_encoders": 2,
+  "token_output_shape": [
+    4,
+    27,
+    27
+  ],
+  "transformers_version": "4.44.0",
+  "vision_towers": [
+    "siglip-so400m-patch14-384",
+    "internvideo2"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_hybrid.HybridTowerConfig"
+  }
+}

vision_tower/configuration_hybrid.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import numpy as np
+import torch
+import torch.nn as nn
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+import os
+import torch.nn.functional as F
+from transformers.modeling_utils import PreTrainedModel
+from transformers.configuration_utils import PretrainedConfig
+from transformers import AutoConfig
+from collections import OrderedDict
+class HybridTowerConfig(PretrainedConfig):
+    model_type = "hybrid_vision_tower"
+    def __init__(self, configs=None, **kwargs):
+        """
+        Initializes the HybridTowerConfig.
+        Args:
+            configs (dict, optional): A dictionary where keys are component names and values are
+                                      instances of configurations that have a `to_dict()` method.
+            **kwargs: Additional keyword arguments that are passed to the superclass.
+        """
+        super().__init__(**kwargs)
+        self.configs = {}
+        if configs is not None:
+            if not isinstance(configs, dict):
+                raise TypeError("configs must be a dictionary where keys are component names and values are configuration objects.")
+            for component_name, config in configs.items():
+                if hasattr(config, 'to_dict'):
+                    self.configs[component_name] = config.to_dict()
+                else:
+                    raise TypeError(f"The configuration for '{component_name}' does not have a to_dict() method and cannot be serialized.")
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary.
+        Returns:
+            dict: A dictionary containing all the keys and values of this configuration instance.
+        """
+        config_dict = super().to_dict()
+        config_dict['configs'] = self.configs
+        return config_dict

vision_tower/internvideo2/config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "architectures": [
+    "PretrainInternVideo2"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "attn_pool_num_heads": 16,
+  "checkpoint_num": 40,
+  "clip_embed_dim": 768,
+  "clip_input_resolution": 224,
+  "clip_norm_type": "l2",
+  "clip_return_layer": 6,
+  "clip_student_return_interval": 1,
+  "clip_teacher": null,
+  "clip_teacher_embed_dim": 3200,
+  "clip_teacher_final_dim": 768,
+  "clip_teacher_return_interval": 1,
+  "d_model": 1408,
+  "encoder_stride": 16,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 768,
+  "image_mask_ratio": 0.5,
+  "image_mask_type": "random",
+  "image_size": 224,
+  "img_size": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "keep_temporal": false,
+  "layer_norm_eps": 1e-12,
+  "model_type": "internvideo2",
+  "name": "pretrain_internvideo2_1b_patch14_224",
+  "num_attention_heads": 12,
+  "num_channels": 3,
+  "num_frames": 4,
+  "num_heads": 16,
+  "num_hidden_layers": 12,
+  "only_mask": true,
+  "patch_size": 14,
+  "qkv_bias": false,
+  "sep_image_video_pos_embed": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0",
+  "tubelet_size": 1,
+  "use_checkpoint": true,
+  "use_flash_attn": false,
+  "use_fused_mlp": false,
+  "use_fused_rmsnorm": false,
+  "video_mask_ratio": 0.8,
+  "video_mask_type": "random",
+  "auto_map": {
+    "AutoConfig": "configuration_internvideo2.InternVideo2Config",
+    "AutoModel": "modeling_internvideo2.InternVideo2Model"
+  }
+}

vision_tower/internvideo2/configuration_internvideo2.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from transformers import PretrainedConfig
+class InternVideo2Config(PretrainedConfig):
+    model_type = "internvideo2"
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=14,
+        tubelet_size=1,
+        num_frames=8,
+        d_model=1408,
+        num_heads=16,
+        depth=40,
+        mlp_ratio=48 / 11,
+        qkv_bias=False,
+        init_values=1e-5,
+        use_checkpoint=False,
+        checkpoint_num=0,
+        use_flash_attn=False,
+        use_fused_mlp=False,
+        use_fused_rmsnorm=False,
+        qk_normalization=True,
+        clip_embed_dim=1408,
+        attn_pool_num_heads=16,
+        clip_teacher_embed_dim=512,
+        clip_teacher_final_dim=512,
+        clip_student_return_interval=4,
+        clip_return_layer=3,
+        clip_norm_type="l2",
+        sep_image_video_pos_embed=False,
+        **kwargs,
+    ):
+        """
+        This is the configuration class to store the configuration of a `InternVideo2Model`.
+        It is used to instantiate a InternVideo2 model according to the specified arguments,
+        defining the model architecture.
+        Args:
+            img_size (int, optional): Input image size. Defaults to 224.
+            patch_size (int, optional): Size of each patch. Defaults to 14.
+            tubelet_size (int, optional): Temporal tubelet size. Defaults to 1.
+            num_frames (int, optional): Number of frames in the video input. Defaults to 8.
+            d_model (int, optional): Dimension of the model embeddings. Defaults to 1408.
+            num_heads (int, optional): Number of attention heads. Defaults to 16.
+            depth (int, optional): Number of transformer encoder layers. Defaults to 40.
+            mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding dim. Defaults to 48/11.
+            qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Defaults to False.
+            init_values (float, optional): Initial values for layer scale. Defaults to 1e-5.
+            use_checkpoint (bool, optional): Whether to use gradient checkpointing. Defaults to False.
+            checkpoint_num (int, optional): Number of layers to apply checkpointing. Defaults to 0.
+            use_flash_attn (bool, optional): Whether to use FlashAttention. Defaults to False.
+            use_fused_mlp (bool, optional): Whether to use fused MLP. Defaults to False.
+            use_fused_rmsnorm (bool, optional): Whether to use fused RMSNorm. Defaults to False.
+            qk_normalization (bool, optional): Whether to apply QK normalization. Defaults to True.
+            clip_embed_dim (int, optional): Embedding dimension for CLIP. Defaults to 1408.
+            attn_pool_num_heads (int, optional): Number of heads for attention pooling. Defaults to 16.
+            clip_teacher_embed_dim (int, optional): Embedding dimension for CLIP teacher model. Defaults to 512.
+            clip_teacher_final_dim (int, optional): Final embedding dimension for CLIP teacher model. Defaults to 512.
+            clip_student_return_interval (int, optional): Interval for returning student layers. Defaults to 4.
+            clip_return_layer (int, optional): Number of layers to return for alignment. Defaults to 3.
+            clip_norm_type (str, optional): Normalization type for CLIP ('l2' or 'none'). Defaults to 'l2'.
+            sep_image_video_pos_embed (bool, optional): Whether to use separate position embeddings for image and video. Defaults to False.
+            **kwargs: Additional keyword arguments.
+        """
+        super().__init__(**kwargs)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.tubelet_size = tubelet_size
+        self.num_frames = num_frames
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.init_values = init_values
+        self.use_checkpoint = use_checkpoint
+        self.checkpoint_num = checkpoint_num
+        self.use_flash_attn = use_flash_attn
+        self.use_fused_mlp = use_fused_mlp
+        self.use_fused_rmsnorm = use_fused_rmsnorm
+        self.qk_normalization = qk_normalization
+        self.clip_embed_dim = clip_embed_dim
+        self.attn_pool_num_heads = attn_pool_num_heads
+        self.clip_teacher_embed_dim = clip_teacher_embed_dim
+        self.clip_teacher_final_dim = clip_teacher_final_dim
+        self.clip_student_return_interval = clip_student_return_interval
+        self.clip_return_layer = clip_return_layer
+        self.clip_norm_type = clip_norm_type
+        self.sep_image_video_pos_embed = sep_image_video_pos_embed

vision_tower/internvideo2/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc98ff193aca2992ed2f56fea01c4d7be2e2c737cf9fcc5a73ef31663e728624
+size 2098289968

vision_tower/internvideo2/modeling_internvideo2.py ADDED Viewed

	@@ -0,0 +1,934 @@

+# modeling_internvideo2.py
+import logging
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.utils import logging as hf_logging
+from torch.utils.checkpoint import checkpoint  # Correct
+from functools import partial
+from .configuration_internvideo2 import InternVideo2Config  # Import the configuration
+try:
+    from einops import rearrange
+except ImportError:
+    raise ImportError("Please install einops to use this model.")
+try:
+    from timm.models.layers import DropPath, to_2tuple
+except ImportError:
+    raise ImportError("Please install timm to use this model.")
+logger = hf_logging.get_logger(__name__)
+# Position embedding functions
+def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
+    assert embed_dim % 4 == 0
+    embed_dim_spatial = embed_dim // 4 * 3
+    embed_dim_temporal = embed_dim // 4
+    # Spatial
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # W first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed_spatial = get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)
+    # Temporal
+    grid_t = np.arange(t_size, dtype=np.float32)
+    pos_embed_temporal = get_1d_sincos_pos_embed_from_grid(embed_dim_temporal, grid_t)
+    # Combine spatial and temporal embeddings
+    pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
+    pos_embed_temporal = np.repeat(pos_embed_temporal, grid_size**2, axis=1)
+    pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
+    pos_embed_spatial = np.repeat(pos_embed_spatial, t_size, axis=0)
+    pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)
+    pos_embed = pos_embed.reshape([-1, embed_dim])
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])
+    emb = np.concatenate([emb_h, emb_w], axis=1)
+    return emb
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / (10000 ** omega)
+    pos = pos.reshape(-1)
+    out = np.einsum('m,d->md', pos, omega)
+    emb_sin = np.sin(out)
+    emb_cos = np.cos(out)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)
+    return emb
+# Define necessary classes: CrossAttention, AttentiveBlock, AttentionPoolingBlock, RMSNorm, LayerScale, Attention, Mlp, Block, PatchEmbed, Linear_Decoder
+class CrossAttention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        attn_head_dim=None,
+        out_dim=None,
+    ):
+        super().__init__()
+        if out_dim is None:
+            out_dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        assert all_head_dim == dim
+        self.q = nn.Linear(dim, all_head_dim, bias=False)
+        self.k = nn.Linear(dim, all_head_dim, bias=False)
+        self.v = nn.Linear(dim, all_head_dim, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.k_bias = None
+            self.v_bias = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, out_dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x, k=None, v=None):
+        B, N, C = x.shape
+        N_k = k.shape[1]
+        N_v = v.shape[1]
+        q_bias, k_bias, v_bias = None, None, None
+        if self.q_bias is not None:
+            q_bias = self.q_bias
+            k_bias = self.k_bias
+            v_bias = self.v_bias
+        q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
+        q = (
+            q.reshape(B, N, 1, self.num_heads, -1)
+            .permute(2, 0, 3, 1, 4)
+            .squeeze(0)
+        )  # (B, N_head, N_q, dim)
+        k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
+        k = (
+            k.reshape(B, N_k, 1, self.num_heads, -1)
+            .permute(2, 0, 3, 1, 4)
+            .squeeze(0)
+        )
+        v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
+        v = (
+            v.reshape(B, N_v, 1, self.num_heads, -1)
+            .permute(2, 0, 3, 1, 4)
+            .squeeze(0)
+        )
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)  # (B, N_head, N_q, N_k)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
class AttentiveBlock(nn.Module):
    """Cross-attention block that normalises query, key and value separately.

    Each of the three streams gets its own ``norm_layer(dim)`` before being
    passed to ``CrossAttention``.

    Args:
        dim: Feature width handed to the three norm layers and to
            ``CrossAttention``.
        num_heads: Forwarded to ``CrossAttention``.
        qkv_bias: Forwarded to ``CrossAttention``.
        qk_scale: Forwarded to ``CrossAttention``.
        drop: Forwarded to ``CrossAttention`` as ``proj_drop``.
        attn_drop: Forwarded to ``CrossAttention``.
        drop_path: When greater than zero, a ``DropPath(drop_path)`` module is
            stored on ``self.drop_path`` (``nn.Identity`` otherwise). Note
            that ``forward`` never applies it.
        norm_layer: Factory called as ``norm_layer(dim)``.
        attn_head_dim: Forwarded to ``CrossAttention``.
        out_dim: Forwarded to ``CrossAttention``.
    """

    def __init__(
        self,
        dim,
        num_heads,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        norm_layer=nn.LayerNorm,
        attn_head_dim=None,
        out_dim=None,
    ):
        super().__init__()

        # One independent normalisation per attention input.
        self.norm1_q = norm_layer(dim)
        self.norm1_k = norm_layer(dim)
        self.norm1_v = norm_layer(dim)

        attention_kwargs = {
            "num_heads": num_heads,
            "qkv_bias": qkv_bias,
            "qk_scale": qk_scale,
            "attn_drop": attn_drop,
            "proj_drop": drop,
            "attn_head_dim": attn_head_dim,
            "out_dim": out_dim,
        }
        self.cross_attn = CrossAttention(dim, **attention_kwargs)

        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(
        self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None
    ):
        """Attend from ``x_q`` over ``x_kv``.

        ``pos_q`` is added to the query and ``pos_k`` to the key source before
        normalisation; the value stream is normalised without any positional
        term. ``bool_masked_pos`` and ``rel_pos_bias`` are accepted but not
        used by this implementation.
        """
        query = self.norm1_q(x_q + pos_q)
        key = self.norm1_k(x_kv + pos_k)
        value = self.norm1_v(x_kv)
        return self.cross_attn(query, k=key, v=value)
class AttentionPoolingBlock(AttentiveBlock):
    """Pool a token sequence into one vector via cross-attention.

    The query is the mean of the input along dimension 1; keys and values are
    the unmodified input. No positional terms are added (both are 0).
    """

    def forward(self, x):
        """Return the pooled representation with dimension 1 squeezed out."""
        mean_query = x.mean(1, keepdim=True)
        pooled = super().forward(
            mean_query, x, 0, 0, bool_masked_pos=None, rel_pos_bias=None
        )
        return pooled.squeeze(1)
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    Args:
        hidden_size: Length of the learnable per-channel ``weight``
            (initialised to ones).
        eps: Constant added to the mean square before the reciprocal
            square root.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise ``hidden_states`` and scale by ``self.weight``.

        The statistics are computed in float32; the normalised tensor is cast
        back to the input dtype before being multiplied by ``self.weight``.
        """
        caller_dtype = hidden_states.dtype
        values_fp32 = hidden_states.to(torch.float32)
        mean_square = values_fp32.pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalised = values_fp32 * inverse_rms
        return self.weight * normalised.to(caller_dtype)
class LayerScale(nn.Module):
    """Learnable per-channel scaling of the input.

    Args:
        dim: Length of the ``weight`` vector.
        init_values: Initial value of every entry of ``weight``.
        inplace: Use ``mul_`` instead of an out-of-place multiply.
        force_fp32: Perform the multiply in float32 and cast the result back
            to the dtype of the input.
    """

    def __init__(
        self, dim, init_values=1e-5, inplace=False, force_fp32=False
    ):
        super().__init__()
        self.inplace = inplace
        self.weight = nn.Parameter(init_values * torch.ones(dim))
        self.force_fp32 = force_fp32

    @torch.cuda.amp.autocast(enabled=False)
    def forward(self, x):
        """Return ``x`` scaled by ``self.weight`` (CUDA autocast disabled)."""
        if not self.force_fp32:
            if self.inplace:
                return x.mul_(self.weight)
            return x * self.weight

        caller_dtype = x.dtype
        x_fp32 = x.float()
        weight_fp32 = self.weight.float()
        if self.inplace:
            # Operates on the float32 tensor returned by ``x.float()``.
            scaled = x_fp32.mul_(weight_fp32)
        else:
            scaled = x_fp32 * weight_fp32
        return scaled.to(dtype=caller_dtype)
class Attention(nn.Module):
    """Multi-head self-attention with an optional flash-attention backend.

    Args:
        dim: Input/output feature width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the fused q/k/v projection has a bias.
        attn_drop: Dropout probability on the attention weights (also passed
            to ``FlashAttention`` as ``attention_dropout``).
        proj_drop: Dropout probability after the output projection.
        use_flash_attn: Route ``forward`` through ``flash_attn``'s
            ``FlashAttention`` instead of the plain implementation.
        causal: Stored and passed to ``FlashAttention`` only when
            ``use_flash_attn`` is set.
        norm_layer: Factory for the q/k normalisation layers, called as
            ``norm_layer(dim)``.
        qk_normalization: Normalise q and k (across all heads jointly)
            before computing attention.
        use_fused_rmsnorm: In the flash path, the norm layers are expected to
            return a sequence whose first item is the normalised tensor.

    Raises:
        ImportError: If ``use_flash_attn`` is set and ``flash_attn`` cannot be
            imported.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        attn_drop=0.0,
        proj_drop=0.0,
        use_flash_attn=False,
        causal=False,
        norm_layer=nn.LayerNorm,
        qk_normalization=False,
        use_fused_rmsnorm=False,
    ):
        super().__init__()
        assert (
            dim % num_heads == 0
        ), "dim should be divisible by num_heads"

        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.use_flash_attn = use_flash_attn
        if use_flash_attn:
            self.causal = causal
            try:
                from flash_attn.flash_attention import FlashAttention

                self.inner_attn = FlashAttention(
                    attention_dropout=attn_drop
                )
            except ImportError:
                raise ImportError(
                    "Please install flash_attn to use flash attention."
                )

        self.qk_normalization = qk_normalization
        if qk_normalization:
            self.q_norm = norm_layer(dim)
            self.k_norm = norm_layer(dim)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()
        self.use_fused_rmsnorm = use_fused_rmsnorm

    def _naive_attn(self, x):
        """Plain softmax attention over ``x`` of shape (B, N, C)."""
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        packed = self.qkv(x).reshape(
            batch, tokens, 3, self.num_heads, head_dim
        )
        # unbind (rather than tuple-unpacking a tensor) keeps torchscript happy
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        if self.qk_normalization:
            n_batch, n_heads, n_tokens, n_dim = q.shape

            def _normalise(norm, heads):
                # Merge heads so the norm sees the full channel width, then
                # restore the (B, H, N, D) layout.
                merged = heads.transpose(1, 2).flatten(-2, -1)
                restored = norm(merged).view(
                    n_batch, n_tokens, n_heads, n_dim
                )
                return restored.transpose(1, 2)

            q = _normalise(self.q_norm, q)
            k = _normalise(self.k_norm, k)

        scores = (q * self.scale) @ k.transpose(-2, -1)
        weights = self.attn_drop(scores.softmax(dim=-1))

        merged_heads = (weights @ v).transpose(1, 2).reshape(
            batch, tokens, channels
        )
        return self.proj_drop(self.proj(merged_heads))

    def _flash_attn(
        self, x, key_padding_mask=None, need_weights=False
    ):
        """Attention through ``self.inner_attn`` on a packed qkv tensor."""
        packed = rearrange(
            self.qkv(x),
            "b s (three h d) -> b s three h d",
            three=3,
            h=self.num_heads,
        )

        if self.qk_normalization:
            q, k, v = packed.unbind(2)
            q_normed = self.q_norm(q.flatten(-2, -1))
            k_normed = self.k_norm(k.flatten(-2, -1))
            if self.use_fused_rmsnorm:
                # Only the first returned item is used here.
                q_normed = q_normed[0]
                k_normed = k_normed[0]
            q = q_normed.view(q.shape)
            k = k_normed.view(k.shape)
            packed = torch.stack([q, k, v], dim=2)

        context, _ = self.inner_attn(
            packed,
            key_padding_mask=key_padding_mask,
            need_weights=need_weights,
            causal=self.causal,
        )
        projected = self.proj(rearrange(context, "b s h d -> b s (h d)"))
        return self.proj_drop(projected)

    def forward(self, x):
        """Dispatch to the flash or the plain attention implementation."""
        if self.use_flash_attn:
            return self._flash_attn(x)
        return self._naive_attn(x)
class Mlp(nn.Module):
    """Two-layer feed-forward network: linear, activation, dropout, linear, dropout.

    Args:
        in_features: Width of the input.
        hidden_features: Width of the hidden layer; falls back to
            ``in_features`` when falsy.
        out_features: Width of the output; falls back to ``in_features`` when
            falsy.
        act_layer: Activation factory, called with no arguments.
        bias: Bias flag(s) for the two linear layers, expanded with
            ``to_2tuple``.
        drop: Dropout probability(ies) for the two dropout layers, expanded
            with ``to_2tuple``.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        bias=True,
        drop=0.0,
    ):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        bias_pair = to_2tuple(bias)
        drop_pair = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias_pair[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_pair[0])
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias_pair[1])
        self.drop2 = nn.Dropout(drop_pair[1])

    def forward(self, x):
        """Apply fc1 -> act -> drop1 -> fc2 -> drop2."""
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
class Block(nn.Module):
    """Transformer encoder block: attention and MLP sub-layers.

    Two execution modes exist, selected by ``use_fused_rmsnorm``:

    * plain: ``x = x + branch(norm(x))`` for each sub-layer, returns a tensor;
    * fused: the norm layers are called as ``norm(x, residual)`` and return
      ``(x, residual)``; the block returns the ``(x, residual)`` pair and the
      caller is responsible for the final addition.

    Args:
        dim: Feature width.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden MLP width as a multiple of ``dim``.
        qkv_bias: Forwarded to ``Attention``.
        drop: Dropout probability for the attention projection and the MLP.
        attn_drop: Forwarded to ``Attention``.
        init_values: Initial ``LayerScale`` value; falsy disables LayerScale.
        drop_path: Stochastic-depth rate; values ``<= 0`` disable ``DropPath``.
        act_layer: Activation factory for the non-fused ``Mlp``.
        norm_layer: Factory for both norm layers, called as ``norm_layer(dim)``.
        use_flash_attn: Forwarded to ``Attention``.
        use_fused_mlp: Use ``flash_attn``'s ``FusedMLP`` instead of ``Mlp``.
        fused_mlp_heuristic: Forwarded to ``FusedMLP`` as ``heuristic``.
        with_cp: Run the block through ``checkpoint``.
        qk_normalization: Forwarded to ``Attention``.
        layerscale_no_force_fp32: When false, LayerScale multiplies in float32.
        use_fused_rmsnorm: Selects the fused execution mode described above.

    Raises:
        ImportError: If ``use_fused_mlp`` is set and ``flash_attn`` cannot be
            imported.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop=0.0,
        attn_drop=0.0,
        init_values=None,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        use_flash_attn=False,
        use_fused_mlp=False,
        fused_mlp_heuristic=1,
        with_cp=False,
        qk_normalization=False,
        layerscale_no_force_fp32=False,
        use_fused_rmsnorm=False,
    ):
        super().__init__()

        def _layer_scale():
            # LayerScale only when an initial value was supplied.
            if init_values:
                return LayerScale(
                    dim,
                    init_values=init_values,
                    force_fp32=(not layerscale_no_force_fp32),
                )
            return nn.Identity()

        def _stochastic_depth():
            # NOTE: drop path for stochastic depth, we shall see if this is
            # better than dropout here
            if drop_path > 0.0:
                return DropPath(drop_path)
            return nn.Identity()

        # --- attention sub-layer ---
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            use_flash_attn=use_flash_attn,
            causal=False,
            norm_layer=norm_layer,
            qk_normalization=qk_normalization,
            use_fused_rmsnorm=use_fused_rmsnorm,
        )
        self.ls1 = _layer_scale()
        self.drop_path1 = _stochastic_depth()

        # --- MLP sub-layer ---
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        if not use_fused_mlp:
            self.mlp = Mlp(
                in_features=dim,
                hidden_features=mlp_hidden_dim,
                act_layer=act_layer,
                drop=drop,
            )
        else:
            try:
                from flash_attn.modules.mlp import FusedMLP
            except ImportError:
                raise ImportError(
                    "Please install flash_attn to use fused MLP."
                )
            self.mlp = FusedMLP(
                in_features=dim,
                hidden_features=mlp_hidden_dim,
                heuristic=fused_mlp_heuristic,
            )
        self.ls2 = _layer_scale()
        self.drop_path2 = _stochastic_depth()

        self.with_cp = with_cp
        self.use_fused_rmsnorm = use_fused_rmsnorm

    def forward(self, x, residual=None):
        """Run the block.

        Returns a tensor in plain mode, or an ``(x, residual)`` pair when
        ``use_fused_rmsnorm`` is set. In plain mode ``residual`` must be
        ``None``.
        """

        def _run(tokens, carried=None):
            if not self.use_fused_rmsnorm:
                assert carried is None
                attn_branch = self.attn(self.norm1(tokens))
                tokens = tokens + self.drop_path1(self.ls1(attn_branch))
                mlp_branch = self.mlp(self.norm2(tokens))
                return tokens + self.drop_path2(self.ls2(mlp_branch))

            # Fused mode: the norm layers take and return the residual.
            tokens, carried = self.norm1(tokens, carried)
            tokens = self.drop_path1(self.ls1(self.attn(tokens)))
            tokens, carried = self.norm2(tokens, carried)
            tokens = self.drop_path2(self.ls2(self.mlp(tokens)))
            return tokens, carried

        if self.with_cp:
            return checkpoint(_run, x, residual)
        return _run(x, residual)
class PatchEmbed(nn.Module):
    """Embed a video clip into patch tokens with a strided 3D convolution.

    Args:
        img_size: Frame size; expanded with ``to_2tuple``.
        patch_size: Spatial patch size; expanded with ``to_2tuple``.
        in_chans: Number of input channels.
        embed_dim: Number of output channels of the projection.
        num_frames: Number of frames used to compute the temporal grid size.
        tubelet_size: Temporal extent (kernel and stride) of each patch.
        norm_layer: Optional factory called as ``norm_layer(embed_dim)``;
            ``nn.Identity`` is used when falsy.

    Attributes set here: ``grid_size`` as ``(T, H, W)``, ``num_patches``
    (``T * H * W``) and ``num_img_patches`` (``H * W``).
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        num_frames=8,
        tubelet_size=1,
        norm_layer=None,
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size

        grid_t = num_frames // tubelet_size
        grid_h = img_size[0] // patch_size[0]
        grid_w = img_size[1] // patch_size[1]
        self.grid_size = (grid_t, grid_h, grid_w)  # (T, H, W)
        self.num_patches = grid_t * grid_h * grid_w
        self.num_img_patches = grid_h * grid_w

        # Non-overlapping tubelets: the stride equals the kernel size.
        tubelet_shape = (tubelet_size, patch_size[0], patch_size[1])
        self.proj = nn.Conv3d(
            in_channels=in_chans,
            out_channels=embed_dim,
            kernel_size=tubelet_shape,
            stride=tubelet_shape,
        )

        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Project ``x`` and return tokens laid out as B x T x HW x C."""
        projected = self.proj(x)
        # B x C x T x H x W -> B x C x T x HW -> B x T x HW x C
        tokens = projected.flatten(3).permute(0, 2, 3, 1)
        return self.norm(tokens)
class Linear_Decoder(nn.Module):
    """Linear projection followed by a norm layer and optional L2 scaling.

    Args:
        in_channels: Input width of the linear head.
        out_channels: Output width of the linear head and of the norm layer.
        norm_layer: Factory called as ``norm_layer(out_channels)``.
        clip_norm_type: ``'l2'`` divides the output by its L2 norm along the
            last dimension; ``'none'`` leaves it unchanged. Any other value
            makes ``forward`` raise ``NotImplementedError``.
    """

    def __init__(self, in_channels=1408, out_channels=3200, norm_layer=nn.LayerNorm, clip_norm_type='l2'):
        super().__init__()
        self.clip_norm_type = clip_norm_type
        logger.info(f'Normalization Type: {clip_norm_type}')

        self.head = nn.Linear(in_channels, out_channels)
        self.norm = norm_layer(out_channels)

    def forward(self, x):
        """Project and normalise ``x`` according to ``clip_norm_type``."""
        projected = self.norm(self.head(x))
        mode = self.clip_norm_type
        if mode == 'none':
            return projected
        if mode == 'l2':
            return projected / projected.norm(dim=-1, keepdim=True)
        raise NotImplementedError
class InternVideo2Model(PreTrainedModel):
    """InternVideo2 vision transformer exposed as a ``PreTrainedModel``.

    The model patch-embeds a clip with a 3D convolution, prepends a class
    token, adds positional embeddings, runs ``config.depth`` transformer
    ``Block`` layers and then (unless only the visual tokens are requested)
    produces an attention-pooled embedding plus CLIP-alignment outputs decoded
    from selected intermediate layers.

    ``use_flash_attn``, ``use_fused_rmsnorm`` and ``use_fused_mlp`` in the
    config must all have the same value.
    """

    config_class = InternVideo2Config
    base_model_prefix = "internvideo2"

    def __init__(self, config: InternVideo2Config):
        """Build all sub-modules from ``config``.

        Raises:
            AssertionError: If the three fused/flash flags disagree.
            ImportError: If fused RMSNorm is requested but ``flash_attn`` is
                not installed.
        """
        super().__init__(config)
        in_chans = 3
        drop_path_rate = 0.25  # maximum stochastic-depth rate (last block)
        qk_normalization = config.qk_normalization
        clip_embed_dim = config.clip_embed_dim
        num_heads = config.num_heads
        qkv_bias = config.qkv_bias
        init_values = config.init_values
        mlp_ratio = config.mlp_ratio
        depth = config.depth
        num_frames = config.num_frames
        self.num_frames = num_frames
        self.tubelet_size = config.tubelet_size
        use_fused_mlp = config.use_fused_mlp
        use_fused_rmsnorm = config.use_fused_rmsnorm
        use_flash_attn = config.use_flash_attn
        assert (
            use_flash_attn
            == use_fused_rmsnorm
            == use_fused_mlp
        ), "use_flash_attn, use_fused_rmsnorm and use_fused_mlp should be consistent"
        self.use_flash_attn = use_flash_attn
        embed_dim = config.d_model
        self.embed_dim = embed_dim
        self.depth = depth
        self.clip_norm_type = config.clip_norm_type

        # Indices of the blocks whose outputs feed the CLIP decoders, counted
        # backwards from the last block in steps of the configured interval.
        self.return_index = [
            depth - int(i * config.clip_student_return_interval) - 1
            for i in range(config.clip_return_layer)
        ]
        logger.info(f"Normalization Type: {config.clip_norm_type}")
        logger.info(f"Student Return Index: {self.return_index}")

        if use_fused_rmsnorm:
            try:
                from flash_attn.ops.rms_norm import DropoutAddRMSNorm
            except ImportError:
                raise ImportError(
                    "Please install flash_attn to use fused RMSNorm."
                )
            norm_layer_for_blocks = partial(
                DropoutAddRMSNorm, eps=1e-6, prenorm=True
            )
        else:
            norm_layer_for_blocks = partial(RMSNorm, eps=1e-6)
        self.norm_layer_for_blocks = norm_layer_for_blocks

        self.patch_embed = PatchEmbed(
            config.img_size,
            config.patch_size,
            in_chans,
            embed_dim,
            num_frames=num_frames,
            tubelet_size=self.tubelet_size,
        )
        num_patches = self.patch_embed.num_patches
        num_img_patches = self.patch_embed.num_img_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # Separate spatial/temporal embeddings are not supported here.
        self.sep_pos_embed = False
        self.sep_image_video_pos_embed = config.sep_image_video_pos_embed
        if self.sep_pos_embed:
            raise NotImplementedError
        else:
            if self.sep_image_video_pos_embed:
                logger.info(
                    "Use joint position embedding, for image and video we use different pos_embed."
                )
                self.pos_embed = nn.Parameter(
                    torch.zeros(1, num_patches + 1, embed_dim)
                )
                self.img_pos_embed = nn.Parameter(
                    torch.zeros(1, num_img_patches + 1, embed_dim)
                )
                # for CLIP decoder
                self.clip_pos_embed = nn.Parameter(
                    torch.zeros(1, num_patches + 1, embed_dim)
                )
                self.clip_img_pos_embed = nn.Parameter(
                    torch.zeros(1, num_img_patches + 1, embed_dim)
                )
            else:
                logger.info(
                    "Use joint position embedding, for image and video we use same pos_embed."
                )
                self.pos_embed = nn.Parameter(
                    torch.zeros(1, num_patches + 1, embed_dim)
                )
                self.clip_pos_embed = nn.Parameter(
                    torch.zeros(1, num_patches + 1, embed_dim)
                )

        # Stochastic-depth rate grows linearly from 0 to drop_path_rate.
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        # Only the first `checkpoint_num` blocks use activation checkpointing.
        use_checkpoint = bool(config.use_checkpoint)
        with_cp_list = [
            use_checkpoint and idx < config.checkpoint_num
            for idx in range(depth)
        ]
        logger.info(f"Droppath rate: {dpr}")
        logger.info(f"Checkpoint list: {with_cp_list}")

        self.blocks = nn.ModuleList(
            [
                Block(
                    embed_dim,
                    num_heads,
                    mlp_ratio,
                    qkv_bias=qkv_bias,
                    norm_layer=norm_layer_for_blocks,
                    drop_path=dpr[i],
                    init_values=init_values,
                    attn_drop=0.0,
                    use_flash_attn=use_flash_attn,
                    use_fused_mlp=use_fused_mlp,
                    fused_mlp_heuristic=1,
                    with_cp=with_cp_list[i],
                    qk_normalization=qk_normalization,
                    layerscale_no_force_fp32=False,
                    use_fused_rmsnorm=use_fused_rmsnorm,
                )
                for i in range(depth)
            ]
        )

        self.clip_projector = AttentionPoolingBlock(
            dim=embed_dim,
            num_heads=config.attn_pool_num_heads,
            qkv_bias=True,
            qk_scale=None,
            drop=0.0,
            attn_drop=0.0,
            norm_layer=partial(nn.LayerNorm, eps=1e-5),
            out_dim=clip_embed_dim,
        )

        # CLIP decoder: one linear decoder per returned intermediate layer.
        self.clip_decoder = nn.ModuleList(
            [
                Linear_Decoder(
                    in_channels=embed_dim,
                    out_channels=config.clip_teacher_embed_dim,
                    norm_layer=partial(nn.LayerNorm, eps=1e-5),
                    clip_norm_type=config.clip_norm_type,
                )
                for _ in range(config.clip_return_layer)
            ]
        )
        self.final_clip_decoder = nn.Identity()
        if config.clip_teacher_final_dim > 0:
            self.final_clip_decoder = Linear_Decoder(
                in_channels=clip_embed_dim,
                out_channels=config.clip_teacher_final_dim,
                norm_layer=partial(nn.LayerNorm, eps=1e-5),
                clip_norm_type=config.clip_norm_type,
            )
        # Removed initialization methods and code

    @property
    def dtype(self):
        """Dtype of the patch-embedding convolution weight."""
        return self.patch_embed.proj.weight.dtype

    def get_num_layers(self):
        """Return the number of transformer blocks."""
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names to be excluded from weight decay."""
        return {
            "pos_embed",
            "pos_embed_spatial",
            "pos_embed_temporal",
            "pos_embed_cls",
            "img_pos_embed",
            "cls_token",
            "clip_pos_embed",
            "clip_pos_embed_spatial",
            "clip_pos_embed_temporal",
            "clip_pos_embed_cls",
            "clip_img_pos_embed",
        }

    def _pick_pos_embed(self, video_pos_embed, image_pos_embed_name, use_image):
        """Choose the positional embedding for video or single-image input."""
        if self.sep_pos_embed:
            raise NotImplementedError
        if not use_image:
            return video_pos_embed
        if self.sep_image_video_pos_embed:
            # A dedicated image embedding exists only in this configuration.
            return getattr(self, image_pos_embed_name)
        # Shared embedding: keep the class slot and average the patch slots
        # over the frame axis to obtain a single-frame embedding.
        cls_part = video_pos_embed[:, 0:1, :]
        frame_mean = (
            video_pos_embed[:, 1:, :]
            .view(
                1,
                self.num_frames,
                self.patch_embed.num_patches // self.num_frames,
                self.embed_dim,
            )
            .mean(dim=1)
        )
        return torch.cat([cls_part, frame_mean], dim=1)

    def forward(
        self,
        x,
        mask=None,
        use_image=False,
        x_vis_return_idx=-1,
        x_vis_only=False,
    ):
        """Encode a clip (or image) and optionally produce CLIP-aligned outputs.

        Args:
            x: Input for the 3D-convolution patch embedding, i.e. a tensor
                laid out as (B, C, T, H, W). It is cast to ``self.dtype``.
            mask: Optional boolean tensor used as ``x[~mask]`` on the
                (B, 1 + T*L, C) token tensor, so ``True`` entries are dropped.
                Presumably (B, 1 + T*L) with the same number of kept tokens in
                every sample; confirm against the caller.
            use_image: Select the image positional embedding instead of the
                video one.
            x_vis_return_idx: The block loop stops after block
                ``depth + x_vis_return_idx``; the default ``-1`` runs all
                blocks.
            x_vis_only: Return only the visual tokens.

        Returns:
            ``x_vis`` when ``x_vis_only`` is true, otherwise the tuple
            ``(x_vis, x_pool_vis, x_clip_align, x_align)``.
        """
        x = self.patch_embed(x.type(self.dtype))
        B, T, L, C = x.shape
        x = x.view([B, T * L, C])

        # Append cls token
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # Add positional embeddings
        x = x + self._pick_pos_embed(self.pos_embed, "img_pos_embed", use_image)

        # Mask tokens
        if mask is not None:
            x = x[~mask].reshape(B, -1, C)
        else:
            x = x.reshape(B, -1, C)

        residual = None
        x_clip = []
        last_block = self.depth + x_vis_return_idx
        for idx, blk in enumerate(self.blocks):
            # Fused blocks return (x, residual); unpack before the next call.
            if isinstance(x, tuple) and len(x) == 2:
                x, residual = x
            x = blk(x, residual=residual)
            # Return intermediate features
            if idx in self.return_index:
                if isinstance(x, tuple) and len(x) == 2:
                    tmp_x, tmp_residual = x
                    # NOTE(review): this tests the residual that was fed INTO
                    # the block, so the very first fused block is never
                    # recorded even if listed in return_index. Kept as is.
                    if residual is not None:
                        x_clip.append(tmp_x + tmp_residual)
                else:
                    x_clip.append(x)
            if idx == last_block:
                break

        # Fold the pending residual of the fused path into the output.
        if isinstance(x, tuple) and len(x) == 2:
            x, residual = x
            if residual is not None:
                x = x + residual

        x_vis = x
        if x_vis_only:
            return x_vis

        x_pool_vis = self.clip_projector(x_vis)
        x_align = self.final_clip_decoder(x_pool_vis)

        # Align CLIP
        x_clip = torch.stack(x_clip)
        K, B, _, C_CLIP = x_clip.shape

        # Add positional embeddings
        clip_pos_embed = self._pick_pos_embed(
            self.clip_pos_embed, "clip_img_pos_embed", use_image
        )
        clip_pos_embed = clip_pos_embed.repeat(B, 1, 1)
        if mask is not None:
            clip_pos_embed = clip_pos_embed[~mask]
        x_clip = x_clip + clip_pos_embed.view(B, -1, C_CLIP).unsqueeze(
            0
        ).repeat(K, 1, 1, 1)

        # CLIP decoder: decoder i handles the i-th collected feature map.
        x_clip_align = torch.stack(
            [
                clip_decoder(x_clip[idx])
                for idx, clip_decoder in enumerate(self.clip_decoder)
            ]
        )
        return x_vis, x_pool_vis, x_clip_align, x_align

    def load_pretrained_weights(self):
        """Load ``config.pretrained`` into this model (non-strict).

        LayerScale parameters stored under ``.ls1.gamma`` / ``.ls2.gamma`` are
        renamed to ``.ls1.weight`` / ``.ls2.weight`` before loading, matching
        the ``weight`` parameter name used by ``LayerScale`` in this file.
        Keys that already use ``weight`` are left untouched. The result of
        ``load_state_dict`` (missing/unexpected keys) is logged.
        """
        pretrained = self.config.pretrained
        if pretrained is None:
            logger.info("No pretrained weights provided.")
            return

        logger.info(f"Loading pretrained weights from {pretrained}")
        # NOTE(security): torch.load unpickles the file; only point
        # `config.pretrained` at checkpoints from a trusted source.
        state_dict = torch.load(pretrained, map_location='cpu')

        # The previous loop replaced '.ls?.weight' with itself, i.e. renamed
        # nothing. NOTE(review): upstream checkpoints presumably name the
        # LayerScale parameter 'gamma' -- confirm against a real checkpoint.
        legacy_suffixes = ('.ls1.gamma', '.ls2.gamma')
        new_state_dict = {}
        for key, value in state_dict.items():
            if key.endswith(legacy_suffixes):
                key = key[: -len('gamma')] + 'weight'
            new_state_dict[key] = value

        # Load the adjusted state_dict
        message = self.load_state_dict(new_state_dict, strict=False)
        logger.info(message)

vision_tower/internvideo2/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+    "Resize": {
+        "size": 224,
+        "interpolation": "bilinear"
+    },
+    "CenterCrop": {
+        "size": [
+            224,
+            224
+        ]
+    },
+    "ClipToTensor": {
+        "channel_nb": 3,
+        "div_255": true,
+        "numpy": false
+    },
+    "Normalize": {
+        "mean": [
+            0.485,
+            0.456,
+            0.406
+        ],
+        "std": [
+            0.229,
+            0.224,
+            0.225
+        ]
+    },
+    "image_processor_type": "transforms"
+}

vision_tower/siglip-so400m-patch14-384/config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_name_or_path": "/opt/hpcaas/.mounts/fs-0663e2d3c38211883/home/orrzohar/Artemis/work_dirs/final_run/apollo-Qwen2.5-7B-Instruct-internvideo2-siglip-so400m-patch14-384-freeze-perciver_128_2-newprompt-ft/checkpoint-13300/vision_tower/siglip-so400m-patch14-384",
+  "architectures": [
+    "SiglipVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 1152,
+  "image_size": 384,
+  "intermediate_size": 4304,
+  "layer_norm_eps": 1e-06,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 27,
+  "patch_size": 14,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0"
+}

vision_tower/siglip-so400m-patch14-384/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e9299e05ae6cc8366374648c1a8d202d57baa136e117c2203a23d135dbb0707
+size 856506120

vision_tower/siglip-so400m-patch14-384/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SiglipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 384,
+    "width": 384
+  }
+}