hyz317 committed on
Commit
e322986
·
verified ·
1 Parent(s): b1a9b83

Upload 46 files

Browse files
Files changed (46) hide show
  1. StdGEN-canonicalize-1024/feature_extractor/preprocessor_config.json +20 -0
  2. StdGEN-canonicalize-1024/image_encoder/config.json +23 -0
  3. StdGEN-canonicalize-1024/image_encoder/pytorch_model.bin +3 -0
  4. StdGEN-canonicalize-1024/model_index.json +33 -0
  5. StdGEN-canonicalize-1024/ref_unet/config.json +45 -0
  6. StdGEN-canonicalize-1024/ref_unet/diffusion_pytorch_model.safetensors +3 -0
  7. StdGEN-canonicalize-1024/scheduler-zerosnr/scheduler_config.json +15 -0
  8. StdGEN-canonicalize-1024/text_encoder/config.json +25 -0
  9. StdGEN-canonicalize-1024/text_encoder/model.fp16.safetensors +3 -0
  10. StdGEN-canonicalize-1024/text_encoder/model.safetensors +3 -0
  11. StdGEN-canonicalize-1024/text_encoder/pytorch_model.bin +3 -0
  12. StdGEN-canonicalize-1024/text_encoder/pytorch_model.fp16.bin +3 -0
  13. StdGEN-canonicalize-1024/tokenizer/merges.txt +0 -0
  14. StdGEN-canonicalize-1024/tokenizer/special_tokens_map.json +24 -0
  15. StdGEN-canonicalize-1024/tokenizer/tokenizer_config.json +34 -0
  16. StdGEN-canonicalize-1024/tokenizer/vocab.json +0 -0
  17. StdGEN-canonicalize-1024/unet/config.json +45 -0
  18. StdGEN-canonicalize-1024/unet/diffusion_pytorch_model.safetensors +3 -0
  19. StdGEN-canonicalize-1024/vae/config.json +29 -0
  20. StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.bin +3 -0
  21. StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.fp16.bin +3 -0
  22. StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.fp16.safetensors +3 -0
  23. StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.safetensors +3 -0
  24. StdGEN-mesh-slrm.pth +3 -0
  25. StdGEN-multiview-1024/.gitattributes +35 -0
  26. StdGEN-multiview-1024/README.md +3 -0
  27. StdGEN-multiview-1024/feature_extractor/preprocessor_config.json +27 -0
  28. StdGEN-multiview-1024/image_encoder/config.json +23 -0
  29. StdGEN-multiview-1024/image_encoder/model.safetensors +3 -0
  30. StdGEN-multiview-1024/image_noising_scheduler/scheduler_config.json +19 -0
  31. StdGEN-multiview-1024/image_normalizer/config.json +6 -0
  32. StdGEN-multiview-1024/image_normalizer/diffusion_pytorch_model.safetensors +3 -0
  33. StdGEN-multiview-1024/model_index.json +40 -0
  34. StdGEN-multiview-1024/scheduler/scheduler_config.json +20 -0
  35. StdGEN-multiview-1024/text_encoder/config.json +25 -0
  36. StdGEN-multiview-1024/text_encoder/model.safetensors +3 -0
  37. StdGEN-multiview-1024/tokenizer/merges.txt +0 -0
  38. StdGEN-multiview-1024/tokenizer/special_tokens_map.json +24 -0
  39. StdGEN-multiview-1024/tokenizer/tokenizer_config.json +38 -0
  40. StdGEN-multiview-1024/tokenizer/vocab.json +0 -0
  41. StdGEN-multiview-1024/unet-old/config.json +95 -0
  42. StdGEN-multiview-1024/unet-old/diffusion_pytorch_model.safetensors +3 -0
  43. StdGEN-multiview-1024/unet/config.json +95 -0
  44. StdGEN-multiview-1024/unet/diffusion_pytorch_model.safetensors +3 -0
  45. StdGEN-multiview-1024/vae/config.json +32 -0
  46. StdGEN-multiview-1024/vae/diffusion_pytorch_model.safetensors +3 -0
StdGEN-canonicalize-1024/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": 224,
3
+ "do_center_crop": true,
4
+ "do_convert_rgb": true,
5
+ "do_normalize": true,
6
+ "do_resize": true,
7
+ "feature_extractor_type": "CLIPFeatureExtractor",
8
+ "image_mean": [
9
+ 0.48145466,
10
+ 0.4578275,
11
+ 0.40821073
12
+ ],
13
+ "image_std": [
14
+ 0.26862954,
15
+ 0.26130258,
16
+ 0.27577711
17
+ ],
18
+ "resample": 3,
19
+ "size": 224
20
+ }
StdGEN-canonicalize-1024/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./image_encoder",
3
+ "architectures": [
4
+ "CLIPVisionModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 1280,
10
+ "image_size": 224,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5120,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 32,
19
+ "patch_size": 14,
20
+ "projection_dim": 1024,
21
+ "torch_dtype": "float16",
22
+ "transformers_version": "4.28.0.dev0"
23
+ }
StdGEN-canonicalize-1024/image_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d3ec1e66737f77a4f3bc2df3c52eacefc69ce7825e2784183b1d4e9877d9193
3
+ size 2528481905
StdGEN-canonicalize-1024/model_index.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableDiffusionPipeline",
3
+ "_diffusers_version": "0.10.0.dev0",
4
+ "feature_extractor": [
5
+ "transformers",
6
+ "CLIPImageProcessor"
7
+ ],
8
+ "requires_safety_checker": false,
9
+ "safety_checker": [
10
+ null,
11
+ null
12
+ ],
13
+ "scheduler": [
14
+ "diffusers",
15
+ "PNDMScheduler"
16
+ ],
17
+ "text_encoder": [
18
+ "transformers",
19
+ "CLIPTextModel"
20
+ ],
21
+ "tokenizer": [
22
+ "transformers",
23
+ "CLIPTokenizer"
24
+ ],
25
+ "unet": [
26
+ "diffusers",
27
+ "UNet2DConditionModel"
28
+ ],
29
+ "vae": [
30
+ "diffusers",
31
+ "AutoencoderKL"
32
+ ]
33
+ }
StdGEN-canonicalize-1024/ref_unet/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.10.0.dev0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": [
6
+ 5,
7
+ 10,
8
+ 20,
9
+ 20
10
+ ],
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280,
15
+ 1280
16
+ ],
17
+ "center_input_sample": false,
18
+ "cross_attention_dim": 1024,
19
+ "down_block_types": [
20
+ "CrossAttnDownBlock2D",
21
+ "CrossAttnDownBlock2D",
22
+ "CrossAttnDownBlock2D",
23
+ "DownBlock2D"
24
+ ],
25
+ "downsample_padding": 1,
26
+ "dual_cross_attention": false,
27
+ "flip_sin_to_cos": true,
28
+ "freq_shift": 0,
29
+ "in_channels": 4,
30
+ "layers_per_block": 2,
31
+ "mid_block_scale_factor": 1,
32
+ "norm_eps": 1e-05,
33
+ "norm_num_groups": 32,
34
+ "num_class_embeds": null,
35
+ "only_cross_attention": false,
36
+ "out_channels": 4,
37
+ "sample_size": 64,
38
+ "up_block_types": [
39
+ "UpBlock2D",
40
+ "CrossAttnUpBlock2D",
41
+ "CrossAttnUpBlock2D",
42
+ "CrossAttnUpBlock2D"
43
+ ],
44
+ "use_linear_projection": true
45
+ }
StdGEN-canonicalize-1024/ref_unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a38c12b73707c077b7b3655d3e86c8ae0bed719c5797fd1780b435dba3631ec6
3
+ size 3476967656
StdGEN-canonicalize-1024/scheduler-zerosnr/scheduler_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.8.0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "num_train_timesteps": 1000,
9
+ "prediction_type": "v_prediction",
10
+ "set_alpha_to_one": false,
11
+ "skip_prk_steps": true,
12
+ "steps_offset": 1,
13
+ "trained_betas": null,
14
+ "rescale_betas_zero_snr": true
15
+ }
StdGEN-canonicalize-1024/text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/stable-diffusion-2",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 1024,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 23,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 512,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.25.0.dev0",
24
+ "vocab_size": 49408
25
+ }
StdGEN-canonicalize-1024/text_encoder/model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:681c555376658c81dc273f2d737a2aeb23ddb6d1d8e5b3a7064636d359a22668
3
+ size 680821096
StdGEN-canonicalize-1024/text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cce6febb0b6d876ee5eb24af35e27e764eb4f9b1d0b7c026c8c3333d4cfc916c
3
+ size 1361597018
StdGEN-canonicalize-1024/text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3e254d7b61353497ea0be2c4013df4ea8f739ee88cffa0ba58cd085459ed565
3
+ size 1361671895
StdGEN-canonicalize-1024/text_encoder/pytorch_model.fp16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bb11b1da63986aaaaefb5ef2100d34109c024ac640cacd9ed697150c1c57f01
3
+ size 680900852
StdGEN-canonicalize-1024/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
StdGEN-canonicalize-1024/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "!",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
StdGEN-canonicalize-1024/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "do_lower_case": true,
12
+ "eos_token": {
13
+ "__type": "AddedToken",
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "errors": "replace",
21
+ "model_max_length": 77,
22
+ "name_or_path": "stabilityai/stable-diffusion-2",
23
+ "pad_token": "<|endoftext|>",
24
+ "special_tokens_map_file": "./special_tokens_map.json",
25
+ "tokenizer_class": "CLIPTokenizer",
26
+ "unk_token": {
27
+ "__type": "AddedToken",
28
+ "content": "<|endoftext|>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
StdGEN-canonicalize-1024/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
StdGEN-canonicalize-1024/unet/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.10.0.dev0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": [
6
+ 5,
7
+ 10,
8
+ 20,
9
+ 20
10
+ ],
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280,
15
+ 1280
16
+ ],
17
+ "center_input_sample": false,
18
+ "cross_attention_dim": 1024,
19
+ "down_block_types": [
20
+ "CrossAttnDownBlock2D",
21
+ "CrossAttnDownBlock2D",
22
+ "CrossAttnDownBlock2D",
23
+ "DownBlock2D"
24
+ ],
25
+ "downsample_padding": 1,
26
+ "dual_cross_attention": false,
27
+ "flip_sin_to_cos": true,
28
+ "freq_shift": 0,
29
+ "in_channels": 4,
30
+ "layers_per_block": 2,
31
+ "mid_block_scale_factor": 1,
32
+ "norm_eps": 1e-05,
33
+ "norm_num_groups": 32,
34
+ "num_class_embeds": null,
35
+ "only_cross_attention": false,
36
+ "out_channels": 4,
37
+ "sample_size": 64,
38
+ "up_block_types": [
39
+ "UpBlock2D",
40
+ "CrossAttnUpBlock2D",
41
+ "CrossAttnUpBlock2D",
42
+ "CrossAttnUpBlock2D"
43
+ ],
44
+ "use_linear_projection": true
45
+ }
StdGEN-canonicalize-1024/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1428ef2f6229f42edb4ed184ad42db944ff04bcac88555ea97537464b0b452e7
3
+ size 3476967656
StdGEN-canonicalize-1024/vae/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.10.0.dev0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "in_channels": 3,
18
+ "latent_channels": 4,
19
+ "layers_per_block": 2,
20
+ "norm_num_groups": 32,
21
+ "out_channels": 3,
22
+ "sample_size": 768,
23
+ "up_block_types": [
24
+ "UpDecoderBlock2D",
25
+ "UpDecoderBlock2D",
26
+ "UpDecoderBlock2D",
27
+ "UpDecoderBlock2D"
28
+ ]
29
+ }
StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
3
+ size 334707217
StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.fp16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44915add42092106e70bffac475aae4283b5e8167a8a0c5f55ccc667ee4ebeb5
3
+ size 167405651
StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
3
+ size 167335342
StdGEN-canonicalize-1024/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815
3
+ size 334643276
StdGEN-mesh-slrm.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d67d8e80738a15d73ddb750d24994a6a9469fb4311c505bec0c924d3b58d2e0e
3
+ size 1628189934
StdGEN-multiview-1024/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
StdGEN-multiview-1024/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
StdGEN-multiview-1024/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 224
26
+ }
27
+ }
StdGEN-multiview-1024/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/stable-diffusion-2-1-unclip",
3
+ "architectures": [
4
+ "CLIPVisionModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 1280,
10
+ "image_size": 224,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5120,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 32,
19
+ "patch_size": 14,
20
+ "projection_dim": 1024,
21
+ "torch_dtype": "float16",
22
+ "transformers_version": "4.37.2"
23
+ }
StdGEN-multiview-1024/image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae616c24393dd1854372b0639e5541666f7521cbe219669255e865cb7f89466a
3
+ size 1264217240
StdGEN-multiview-1024/image_noising_scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDPMScheduler",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "beta_end": 0.02,
5
+ "beta_schedule": "squaredcos_cap_v2",
6
+ "beta_start": 0.0001,
7
+ "clip_sample": true,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "epsilon",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "steps_offset": 0,
15
+ "thresholding": false,
16
+ "timestep_spacing": "leading",
17
+ "trained_betas": null,
18
+ "variance_type": "fixed_small"
19
+ }
StdGEN-multiview-1024/image_normalizer/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableUnCLIPImageNormalizer",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "_name_or_path": "stabilityai/stable-diffusion-2-1-unclip",
5
+ "embedding_dim": 1024
6
+ }
StdGEN-multiview-1024/image_normalizer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7772cf09639cea0c65639a3bfc88004a66d42259090d03fa8e15efdc255f240a
3
+ size 4272
StdGEN-multiview-1024/model_index.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableUnCLIPImg2ImgPipeline",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "feature_extractor": [
5
+ "transformers",
6
+ "CLIPImageProcessor"
7
+ ],
8
+ "image_encoder": [
9
+ "transformers",
10
+ "CLIPVisionModelWithProjection"
11
+ ],
12
+ "image_noising_scheduler": [
13
+ "diffusers",
14
+ "DDPMScheduler"
15
+ ],
16
+ "image_normalizer": [
17
+ "stable_diffusion",
18
+ "StableUnCLIPImageNormalizer"
19
+ ],
20
+ "scheduler": [
21
+ "diffusers",
22
+ "DDIMScheduler"
23
+ ],
24
+ "text_encoder": [
25
+ "transformers",
26
+ "CLIPTextModel"
27
+ ],
28
+ "tokenizer": [
29
+ "transformers",
30
+ "CLIPTokenizer"
31
+ ],
32
+ "unet": [
33
+ "multiview.models.unet_mv2d_condition",
34
+ "UNetMV2DConditionModel"
35
+ ],
36
+ "vae": [
37
+ "diffusers",
38
+ "AutoencoderKL"
39
+ ]
40
+ }
StdGEN-multiview-1024/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "v_prediction",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "set_alpha_to_one": false,
15
+ "skip_prk_steps": true,
16
+ "steps_offset": 1,
17
+ "thresholding": false,
18
+ "timestep_spacing": "leading",
19
+ "trained_betas": null
20
+ }
StdGEN-multiview-1024/text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/stable-diffusion-2-1-unclip",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 1024,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 23,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 512,
22
+ "torch_dtype": "float16",
23
+ "transformers_version": "4.37.2",
24
+ "vocab_size": 49408
25
+ }
StdGEN-multiview-1024/text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
3
+ size 680820392
StdGEN-multiview-1024/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
StdGEN-multiview-1024/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "!",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
StdGEN-multiview-1024/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "!",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49406": {
13
+ "content": "<|startoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "49407": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "bos_token": "<|startoftext|>",
30
+ "clean_up_tokenization_spaces": true,
31
+ "do_lower_case": true,
32
+ "eos_token": "<|endoftext|>",
33
+ "errors": "replace",
34
+ "model_max_length": 77,
35
+ "pad_token": "!",
36
+ "tokenizer_class": "CLIPTokenizer",
37
+ "unk_token": "<|endoftext|>"
38
+ }
StdGEN-multiview-1024/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
StdGEN-multiview-1024/unet-old/config.json ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNetMV2DConditionModel",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "_name_or_path": "../checkpoint_backup/output/unit-unclip-512-6view-randomele-self+row-8w-selfcd-rowmv-2block-linear-depthfilter-step-removewh-bs256-three/checkpoint-40000/unet_ema",
5
+ "act_fn": "silu",
6
+ "addition_channels": [
7
+ 1280,
8
+ 1280,
9
+ 1280
10
+ ],
11
+ "addition_downsample": false,
12
+ "addition_embed_type": null,
13
+ "addition_embed_type_num_heads": 64,
14
+ "addition_time_embed_dim": null,
15
+ "attention_head_dim": [
16
+ 5,
17
+ 10,
18
+ 20,
19
+ 20
20
+ ],
21
+ "block_out_channels": [
22
+ 320,
23
+ 640,
24
+ 1280,
25
+ 1280
26
+ ],
27
+ "cd_attention_last": false,
28
+ "cd_attention_mid": false,
29
+ "center_input_sample": false,
30
+ "class_embed_type": "projection",
31
+ "class_embeddings_concat": false,
32
+ "conv_in_kernel": 3,
33
+ "conv_out_kernel": 3,
34
+ "cross_attention_dim": 1024,
35
+ "cross_attention_norm": null,
36
+ "decay": 0.9999,
37
+ "down_block_types": [
38
+ "CrossAttnDownBlockMV2D",
39
+ "CrossAttnDownBlockMV2D",
40
+ "CrossAttnDownBlockMV2D",
41
+ "DownBlock2D"
42
+ ],
43
+ "downsample_padding": 1,
44
+ "dual_cross_attention": false,
45
+ "encoder_hid_dim": null,
46
+ "encoder_hid_dim_type": null,
47
+ "flip_sin_to_cos": true,
48
+ "freq_shift": 0,
49
+ "in_channels": 8,
50
+ "inv_gamma": 1.0,
51
+ "layers_per_block": 2,
52
+ "mid_block_only_cross_attention": null,
53
+ "mid_block_scale_factor": 1,
54
+ "mid_block_type": "UNetMidBlockMV2DCrossAttn",
55
+ "min_decay": 0.0,
56
+ "multiview_attention": true,
57
+ "mvcd_attention": true,
58
+ "norm_eps": 1e-05,
59
+ "norm_num_groups": 32,
60
+ "num_attention_heads": null,
61
+ "num_class_embeds": null,
62
+ "num_regress_blocks": 3,
63
+ "num_views": 6,
64
+ "only_cross_attention": false,
65
+ "optimization_step": 40000,
66
+ "out_channels": 4,
67
+ "power": 0.6666666666666666,
68
+ "projection_camera_embeddings_input_dim": 4,
69
+ "projection_class_embeddings_input_dim": 2048,
70
+ "regress_elevation": true,
71
+ "regress_focal_length": true,
72
+ "resnet_out_scale_factor": 1.0,
73
+ "resnet_skip_time_act": false,
74
+ "resnet_time_scale_shift": "default",
75
+ "sample_size": 64,
76
+ "selfattn_block": "self_rowwise",
77
+ "sparse_mv_attention": true,
78
+ "time_cond_proj_dim": null,
79
+ "time_embedding_act_fn": null,
80
+ "time_embedding_dim": null,
81
+ "time_embedding_type": "positional",
82
+ "timestep_post_act": null,
83
+ "transformer_layers_per_block": 1,
84
+ "up_block_types": [
85
+ "UpBlock2D",
86
+ "CrossAttnUpBlockMV2D",
87
+ "CrossAttnUpBlockMV2D",
88
+ "CrossAttnUpBlockMV2D"
89
+ ],
90
+ "upcast_attention": true,
91
+ "update_after_step": 0,
92
+ "use_dino": false,
93
+ "use_ema_warmup": false,
94
+ "use_linear_projection": true
95
+ }
StdGEN-multiview-1024/unet-old/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af12a68fdbfa51bb857422b42bd5ac5101467e22e5e58ea6d8b06dd9426c93af
3
+ size 1895432652
StdGEN-multiview-1024/unet/config.json ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNetMV2DConditionModel",
3
+ "_diffusers_version": "0.27.2",
4
+ "_name_or_path": "/mnt/ceph-AT/private/yuzehe/git/animegen/sd-model-finetuned-20241026-768/checkpoint-12000",
5
+ "act_fn": "silu",
6
+ "addition_channels": [
7
+ 1280,
8
+ 1280,
9
+ 1280
10
+ ],
11
+ "addition_downsample": false,
12
+ "addition_embed_type": null,
13
+ "addition_embed_type_num_heads": 64,
14
+ "addition_time_embed_dim": null,
15
+ "attention_head_dim": [
16
+ 5,
17
+ 10,
18
+ 20,
19
+ 20
20
+ ],
21
+ "block_out_channels": [
22
+ 320,
23
+ 640,
24
+ 1280,
25
+ 1280
26
+ ],
27
+ "cd_attention_last": false,
28
+ "cd_attention_mid": false,
29
+ "center_input_sample": false,
30
+ "class_embed_type": "projection",
31
+ "class_embeddings_concat": false,
32
+ "conv_in_kernel": 3,
33
+ "conv_out_kernel": 3,
34
+ "cross_attention_dim": 1024,
35
+ "cross_attention_norm": null,
36
+ "decay": 0.9999,
37
+ "down_block_types": [
38
+ "CrossAttnDownBlockMV2D",
39
+ "CrossAttnDownBlockMV2D",
40
+ "CrossAttnDownBlockMV2D",
41
+ "DownBlock2D"
42
+ ],
43
+ "downsample_padding": 1,
44
+ "dual_cross_attention": false,
45
+ "encoder_hid_dim": null,
46
+ "encoder_hid_dim_type": null,
47
+ "flip_sin_to_cos": true,
48
+ "freq_shift": 0,
49
+ "in_channels": 8,
50
+ "inv_gamma": 1.0,
51
+ "layers_per_block": 2,
52
+ "mid_block_only_cross_attention": null,
53
+ "mid_block_scale_factor": 1,
54
+ "mid_block_type": "UNetMidBlockMV2DCrossAttn",
55
+ "min_decay": 0.0,
56
+ "multiview_attention": true,
57
+ "mvcd_attention": true,
58
+ "norm_eps": 1e-05,
59
+ "norm_num_groups": 32,
60
+ "num_attention_heads": null,
61
+ "num_class_embeds": null,
62
+ "num_regress_blocks": 3,
63
+ "num_views": 6,
64
+ "only_cross_attention": false,
65
+ "optimization_step": 40000,
66
+ "out_channels": 4,
67
+ "power": 0.6666666666666666,
68
+ "projection_camera_embeddings_input_dim": 4,
69
+ "projection_class_embeddings_input_dim": 2048,
70
+ "regress_elevation": true,
71
+ "regress_focal_length": true,
72
+ "resnet_out_scale_factor": 1.0,
73
+ "resnet_skip_time_act": false,
74
+ "resnet_time_scale_shift": "default",
75
+ "sample_size": 64,
76
+ "selfattn_block": "self_rowwise",
77
+ "sparse_mv_attention": true,
78
+ "time_cond_proj_dim": null,
79
+ "time_embedding_act_fn": null,
80
+ "time_embedding_dim": null,
81
+ "time_embedding_type": "positional",
82
+ "timestep_post_act": null,
83
+ "transformer_layers_per_block": 1,
84
+ "up_block_types": [
85
+ "UpBlock2D",
86
+ "CrossAttnUpBlockMV2D",
87
+ "CrossAttnUpBlockMV2D",
88
+ "CrossAttnUpBlockMV2D"
89
+ ],
90
+ "upcast_attention": true,
91
+ "update_after_step": 0,
92
+ "use_dino": false,
93
+ "use_ema_warmup": false,
94
+ "use_linear_projection": true
95
+ }
StdGEN-multiview-1024/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc9181f857ce347dc2bf3ca7f5b6abd814f98bfc6f029868866b7f65ee642c4a
3
+ size 1895432652
StdGEN-multiview-1024/vae/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.26.0.dev0",
4
+ "_name_or_path": "stabilityai/stable-diffusion-2-1-unclip",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "layers_per_block": 2,
22
+ "norm_num_groups": 32,
23
+ "out_channels": 3,
24
+ "sample_size": 768,
25
+ "scaling_factor": 0.18215,
26
+ "up_block_types": [
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D",
30
+ "UpDecoderBlock2D"
31
+ ]
32
+ }
StdGEN-multiview-1024/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
3
+ size 167335342