PhyscalX committed on
Commit
1355d9b
1 Parent(s): 764cc00

Fix ViT-H builder

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. tokenize_anything/models/easy_build.py +2 -0
app.py CHANGED
@@ -31,7 +31,7 @@ from tokenize_anything.utils.image import im_vstack
31
  def parse_args():
32
  """Parse arguments."""
33
  parser = argparse.ArgumentParser(description="Launch gradio application")
34
- parser.add_argument("--model-type", type=str, default="tap_vit_l")
35
  parser.add_argument("--checkpoint", type=str, default="models/tap_vit_h_v1_1.pkl")
36
  parser.add_argument("--concept", type=str, default="concepts/merged_2560.pkl")
37
  parser.add_argument("--device", nargs="+", type=int, default=[0], help="Index of devices")
 
31
  def parse_args():
32
  """Parse arguments."""
33
  parser = argparse.ArgumentParser(description="Launch gradio application")
34
+ parser.add_argument("--model-type", type=str, default="tap_vit_h")
35
  parser.add_argument("--checkpoint", type=str, default="models/tap_vit_h_v1_1.pkl")
36
  parser.add_argument("--concept", type=str, default="concepts/merged_2560.pkl")
37
  parser.add_argument("--device", nargs="+", type=int, default=[0], help="Index of devices")
tokenize_anything/models/easy_build.py CHANGED
@@ -106,8 +106,10 @@ def image_tokenizer(image_encoder, checkpoint=None, device=0, dtype="float16", *
106
 
107
  vit_b_encoder = partial(vit_encoder, depth=12, embed_dim=768, num_heads=12)
108
  vit_l_encoder = partial(vit_encoder, depth=24, embed_dim=1024, num_heads=16)
 
109
 
110
  model_registry = {
111
  "tap_vit_b": partial(image_tokenizer, image_encoder=vit_b_encoder),
112
  "tap_vit_l": partial(image_tokenizer, image_encoder=vit_l_encoder),
 
113
  }
 
106
 
107
  vit_b_encoder = partial(vit_encoder, depth=12, embed_dim=768, num_heads=12)
108
  vit_l_encoder = partial(vit_encoder, depth=24, embed_dim=1024, num_heads=16)
109
+ vit_h_encoder = partial(vit_encoder, depth=32, embed_dim=1280, num_heads=16)
110
 
111
  model_registry = {
112
  "tap_vit_b": partial(image_tokenizer, image_encoder=vit_b_encoder),
113
  "tap_vit_l": partial(image_tokenizer, image_encoder=vit_l_encoder),
114
+ "tap_vit_h": partial(image_tokenizer, image_encoder=vit_h_encoder),
115
  }