jinaai
/

jina-clip-v2

Model card Files Files and versions Community

bwang0911 committed on Oct 8

Commit

af015b7

•

1 Parent(s): 5111722

Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +30 -3
config.json +177 -0
preprocessor_config.json +22 -0
special_tokens_map.json +51 -0
tokenizer.json +3 -0
tokenizer_config.json +54 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,30 @@
----
-license: cc-by-nc-4.0
----

+## Usage
+Usage is similar to jina-clip-v1. The only difference is that this model supports Matryoshka embeddings: pass the `truncate_dim` argument to the `encode_text` and `encode_image` methods to truncate the output embeddings to the desired dimension.
+```python
+!pip install transformers einops timm pillow
+from transformers import AutoModel
+# Initialize the model
+model = AutoModel.from_pretrained('jinaai/jina-clip-v2-test', trust_remote_code=True)
+# New meaningful sentences
+sentences = ['A blue cat', 'A red cat']
+# Public image URLs
+image_urls = [
+    'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
+    'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
+]
+# Encode text and images
+truncate = 512
+text_embeddings = model.encode_text(sentences, truncate_dim = truncate)
+image_embeddings = model.encode_image(image_urls, truncate_dim = truncate)  # also accepts PIL.Image objects, local filenames, and data URIs
+# Compute similarities
+print(text_embeddings[0] @ text_embeddings[1].T) # text embedding similarity
+print(text_embeddings[0] @ image_embeddings[0].T) # text-image cross-modal similarity
+print(text_embeddings[0] @ image_embeddings[1].T) # text-image cross-modal similarity
+print(text_embeddings[1] @ image_embeddings[0].T) # text-image cross-modal similarity
+print(text_embeddings[1] @ image_embeddings[1].T) # text-image cross-modal similarity
+```

config.json ADDED Viewed

	@@ -0,0 +1,177 @@

+{
+    "_commit_hash": null,
+    "_name_or_path": "jinaai/jina-clip-v2-test",
+    "add_projections": false,
+    "architectures": [
+        "JinaCLIPModel"
+    ],
+    "auto_map": {
+        "AutoConfig": "jinaai/jina-clip-implementation--configuration_clip.JinaCLIPConfig",
+        "AutoModel": "jinaai/jina-clip-implementation--modeling_clip.JinaCLIPModel"
+    },
+    "initializer_factor": 1.0,
+    "logit_scale_init_value": 2.6592,
+    "model_type": "jina_clip",
+    "projection_dim": 1024,
+    "matryoshka_dimensions": [32, 64, 128, 256, 512, 768, 1024],
+    "text_config": {
+        "_name_or_path": "",
+        "add_cross_attention": false,
+        "architectures": null,
+        "bad_words_ids": null,
+        "begin_suppress_tokens": null,
+        "bos_token_id": null,
+        "chunk_size_feed_forward": 0,
+        "cross_attention_hidden_size": null,
+        "decoder_start_token_id": null,
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "early_stopping": false,
+        "embed_dim": 1024,
+        "encoder_no_repeat_ngram_size": 0,
+        "eos_token_id": null,
+        "exponential_decay_length_penalty": null,
+        "finetuning_task": null,
+        "forced_bos_token_id": null,
+        "forced_eos_token_id": null,
+        "hf_model_config_kwargs": {
+            "use_flash_attn": false
+        },
+        "hf_model_name_or_path": "jinaai/jina-xlm-roberta-large-rope-8k",
+        "id2label": {
+            "0": "LABEL_0",
+            "1": "LABEL_1"
+        },
+        "is_decoder": false,
+        "is_encoder_decoder": false,
+        "label2id": {
+            "LABEL_0": 0,
+            "LABEL_1": 1
+        },
+        "length_penalty": 1.0,
+        "max_length": 20,
+        "min_length": 0,
+        "model_type": "jina_clip_text",
+        "no_repeat_ngram_size": 0,
+        "num_beam_groups": 1,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "output_attentions": false,
+        "output_hidden_states": false,
+        "output_scores": false,
+        "pad_token_id": null,
+        "pooler_type": "mean_pooler",
+        "prefix": null,
+        "problem_type": null,
+        "proj_bias": false,
+        "proj_type": null,
+        "pruned_heads": {},
+        "remove_invalid_values": false,
+        "repetition_penalty": 1.0,
+        "return_dict": true,
+        "return_dict_in_generate": false,
+        "sep_token_id": null,
+        "suppress_tokens": null,
+        "task_specific_params": null,
+        "temperature": 1.0,
+        "tf_legacy_loss": false,
+        "tie_encoder_decoder": false,
+        "tie_word_embeddings": true,
+        "tokenizer_class": null,
+        "top_k": 50,
+        "top_p": 1.0,
+        "torch_dtype": null,
+        "torchscript": false,
+        "transformers_version": "4.42.4",
+        "typical_p": 1.0,
+        "use_bfloat16": false
+    },
+    "torch_dtype": "float16",
+    "transformers_version": null,
+    "use_text_flash_attn": null,
+    "use_vision_xformers": null,
+    "vision_config": {
+        "_name_or_path": "",
+        "add_cross_attention": false,
+        "architectures": null,
+        "bad_words_ids": null,
+        "begin_suppress_tokens": null,
+        "bos_token_id": null,
+        "chunk_size_feed_forward": 0,
+        "cross_attention_hidden_size": null,
+        "decoder_start_token_id": null,
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "drop_path_rate": 0.0,
+        "early_stopping": false,
+        "embed_dim": 1024,
+        "encoder_no_repeat_ngram_size": 0,
+        "eos_token_id": null,
+        "exponential_decay_length_penalty": null,
+        "finetuning_task": null,
+        "forced_bos_token_id": null,
+        "forced_eos_token_id": null,
+        "fused_layer_norm": false,
+        "head_width": 64,
+        "id2label": {
+            "0": "LABEL_0",
+            "1": "LABEL_1"
+        },
+        "image_size": 384,
+        "intp_freq": true,
+        "is_decoder": false,
+        "is_encoder_decoder": false,
+        "label2id": {
+            "LABEL_0": 0,
+            "LABEL_1": 1
+        },
+        "layers": 24,
+        "length_penalty": 1.0,
+        "ls_init_value": null,
+        "max_length": 20,
+        "min_length": 0,
+        "mlp_ratio": 2.6667,
+        "model_type": "jina_clip_vision",
+        "naive_swiglu": true,
+        "no_repeat_ngram_size": 0,
+        "num_beam_groups": 1,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "output_attentions": false,
+        "output_hidden_states": false,
+        "output_scores": false,
+        "pad_token_id": null,
+        "patch_dropout": 0.1,
+        "patch_size": 14,
+        "post_norm": false,
+        "prefix": null,
+        "problem_type": null,
+        "proj_type": null,
+        "pruned_heads": {},
+        "pt_hw_seq_len": 16,
+        "qkv_bias": true,
+        "remove_invalid_values": false,
+        "repetition_penalty": 1.0,
+        "return_dict": true,
+        "return_dict_in_generate": false,
+        "rope_embeddings": true,
+        "sep_token_id": null,
+        "subln": true,
+        "suppress_tokens": null,
+        "task_specific_params": null,
+        "temperature": 1.0,
+        "tf_legacy_loss": false,
+        "tie_encoder_decoder": false,
+        "tie_word_embeddings": true,
+        "tokenizer_class": null,
+        "top_k": 50,
+        "top_p": 1.0,
+        "torch_dtype": null,
+        "torchscript": false,
+        "transformers_version": "4.42.4",
+        "typical_p": 1.0,
+        "use_bfloat16": false,
+        "width": 1024,
+        "x_attention": false
+    }
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+    "auto_map": {
+        "AutoImageProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPImageProcessor",
+        "AutoProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPProcessor"
+    },
+    "fill_color": 0,
+    "image_processor_type": "JinaCLIPImageProcessor",
+    "interpolation": "bicubic",
+    "mean": [
+        0.48145466,
+        0.4578275,
+        0.40821073
+    ],
+    "processor_class": "JinaCLIPProcessor",
+    "resize_mode": "shortest",
+    "size": 384,
+    "std": [
+        0.26862954,
+        0.26130258,
+        0.27577711
+    ]
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
+size 17082734

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 8194,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}