Gengzigang committed on
Commit 845a82f
1 Parent(s): 2059c91
Files changed (1)
  1. README.md +35 -14
README.md CHANGED
@@ -28,28 +28,49 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of

  ## Usage

- ### Huggingface Version
+ ### Pytorch Version
+ Go to [GitHub](https://github.com/microsoft/LLM2CLIP/tree/main/llm2clip)
  ```python
+ import os
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+ from transformers import AutoModel, AutoConfig, AutoTokenizer
+ from eva_clip import create_model_and_transforms
+ from llm2vec import LLM2Vec
  from PIL import Image
- from transformers import AutoModel
- from transformers import CLIPImageProcessor
  import torch

- image_path = "CLIP.png"
- model_name_or_path = "LLM2CLIP-EVA02-B-16" # or /path/to/local/LLM2CLIP-EVA02-B-16
- image_size = 224

- processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
- model = AutoModel.from_pretrained(
-     model_name_or_path,
-     torch_dtype=torch.float16,
-     trust_remote_code=True).to('cuda').eval()
+ model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
+ ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
+ model.load_state_dict(ckpt)
+ model = model.cuda().eval()
+
+ llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
+ config = AutoConfig.from_pretrained(
+     llm_model_name, trust_remote_code=True
+ )
+ llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+ llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
+ l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
+
+ image_path = "CLIP.png"
+ captions = ["a diagram", "a dog", "a cat"]

- image = Image.open(image_path)
- input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
+ image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
+ text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')

  with torch.no_grad(), torch.cuda.amp.autocast():
-     outputs = model.get_image_features(input_pixels)
+     image_features = model.encode_image(image)
+     text_features = model.encode_text(text_features)
+
+     image_features /= image_features.norm(dim=-1, keepdim=True)
+     text_features /= text_features.norm(dim=-1, keepdim=True)
+
+     text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+ print("Label probs:", text_probs)
  ```

  ## BibTeX & Citation