turn-the-cam-anonymous committed on
Commit
dc1ad90
·
1 Parent(s): 7fe4518

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LICENSE +21 -0
  2. app.py +219 -0
  3. configs/sd-objaverse-finetune-c_concat-256.yaml +117 -0
  4. gradio_new.py +663 -0
  5. gradio_objaverse.py +184 -0
  6. instructions.md +11 -0
  7. ldm/data/__init__.py +0 -0
  8. ldm/data/base.py +40 -0
  9. ldm/data/coco.py +253 -0
  10. ldm/data/dummy.py +34 -0
  11. ldm/data/imagenet.py +394 -0
  12. ldm/data/inpainting/__init__.py +0 -0
  13. ldm/data/inpainting/synthetic_mask.py +166 -0
  14. ldm/data/laion.py +537 -0
  15. ldm/data/lsun.py +92 -0
  16. ldm/data/nerf_like.py +165 -0
  17. ldm/data/simple.py +526 -0
  18. ldm/extras.py +77 -0
  19. ldm/guidance.py +96 -0
  20. ldm/lr_scheduler.py +98 -0
  21. ldm/models/autoencoder.py +443 -0
  22. ldm/models/diffusion/__init__.py +0 -0
  23. ldm/models/diffusion/classifier.py +267 -0
  24. ldm/models/diffusion/ddim.py +324 -0
  25. ldm/models/diffusion/ddpm.py +1994 -0
  26. ldm/models/diffusion/plms.py +259 -0
  27. ldm/models/diffusion/sampling_util.py +50 -0
  28. ldm/modules/attention.py +266 -0
  29. ldm/modules/diffusionmodules/__init__.py +0 -0
  30. ldm/modules/diffusionmodules/model.py +835 -0
  31. ldm/modules/diffusionmodules/openaimodel.py +996 -0
  32. ldm/modules/diffusionmodules/util.py +267 -0
  33. ldm/modules/distributions/__init__.py +0 -0
  34. ldm/modules/distributions/distributions.py +92 -0
  35. ldm/modules/ema.py +76 -0
  36. ldm/modules/encoders/__init__.py +0 -0
  37. ldm/modules/encoders/modules.py +550 -0
  38. ldm/modules/evaluate/adm_evaluator.py +676 -0
  39. ldm/modules/evaluate/evaluate_perceptualsim.py +630 -0
  40. ldm/modules/evaluate/frechet_video_distance.py +147 -0
  41. ldm/modules/evaluate/ssim.py +124 -0
  42. ldm/modules/evaluate/torch_frechet_video_distance.py +294 -0
  43. ldm/modules/image_degradation/__init__.py +2 -0
  44. ldm/modules/image_degradation/bsrgan.py +730 -0
  45. ldm/modules/image_degradation/bsrgan_light.py +650 -0
  46. ldm/modules/image_degradation/utils/test.png +0 -0
  47. ldm/modules/image_degradation/utils_image.py +916 -0
  48. ldm/modules/losses/__init__.py +1 -0
  49. ldm/modules/losses/contperceptual.py +111 -0
  50. ldm/modules/losses/vqperceptual.py +167 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import fire
3
+ import gradio as gr
4
+ import numpy as np
5
+ import rich
6
+ import torch
7
+ from contextlib import nullcontext
8
+ from einops import rearrange
9
+ from functools import partial
10
+ from ldm.models.diffusion.ddim import DDIMSampler
11
+ from ldm.util import load_and_preprocess, instantiate_from_config
12
+ from omegaconf import OmegaConf
13
+ from PIL import Image
14
+ from rich import print
15
+ from torch import autocast
16
+ from torchvision import transforms
17
+
18
+
19
+ _SHOW_INTERMEDIATE = True
20
+ _GPU_INDEX = 0
21
+ # _GPU_INDEX = 2
22
+
23
+
24
def load_model_from_config(config, ckpt, device, verbose=False):
    '''
    Instantiate the model described by config.model and load checkpoint weights into it.

    :param config: config object exposing a `model` entry for instantiate_from_config.
    :param ckpt (str): path of the checkpoint file; it must contain a 'state_dict' entry.
    :param device: torch device (or device string) that the finished model is moved to.
    :param verbose (bool): if True, print the missing / unexpected state-dict keys.
    :return the model, moved to device and switched to eval mode.
    '''
    print(f'Loading model from {ckpt}')
    # Deserialize onto the CPU. Mapping the checkpoint straight to the target device keeps a
    # second copy of every tensor on the GPU while the model itself is being moved there,
    # which roughly doubles peak GPU memory at start-up.
    # NOTE(review): torch.load unpickles arbitrary objects -- only load trusted checkpoints.
    pl_sd = torch.load(ckpt, map_location='cpu')
    if 'global_step' in pl_sd:
        print(f'Global Step: {pl_sd["global_step"]}')
    sd = pl_sd['state_dict']
    model = instantiate_from_config(config.model)
    # strict=False tolerates key mismatches; they are only reported when verbose is set.
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print('missing keys:')
        print(m)
    if len(u) > 0 and verbose:
        print('unexpected keys:')
        print(u)

    model.to(device)
    model.eval()
    return model
42
+
43
+
44
@torch.no_grad()
def sample_model(input_im, model, sampler, precision, h, w, ddim_steps, n_samples, scale,
                 ddim_eta, x, y, z):
    '''
    Draw n_samples novel-view images for one conditioning image and a relative camera pose.

    :param input_im: image tensor passed to both model.get_learned_conditioning and
        model.encode_first_stage (main() in this file passes a (1, 3, h, w) tensor in [-1, 1]).
    :param model: diffusion model providing ema_scope, get_learned_conditioning,
        cc_projection, encode_first_stage and decode_first_stage.
    :param sampler: sampler object whose sample(...) method returns (latents, extra).
    :param precision (str): 'autocast' enables torch autocast; any other value disables it.
    :param h, w (int): image size; latents are sampled at (4, h // 8, w // 8).
    :param ddim_steps (int): number of sampler steps.
    :param n_samples (int): number of images to draw.
    :param scale (float): unconditional guidance scale; 1.0 skips the unconditional branch.
    :param ddim_eta (float): eta forwarded to the sampler.
    :param x, y (float): angles in degrees; x enters the pose vector in radians, y as (sin, cos).
    :param z (float): third pose component, appended unchanged.
    :return decoded samples rescaled from [-1, 1] to [0, 1], clamped, on the CPU.
    '''
    precision_scope = autocast if precision == 'autocast' else nullcontext
    with precision_scope('cuda'):
        with model.ema_scope():
            c = model.get_learned_conditioning(input_im).tile(n_samples, 1, 1)
            # Pose vector appended to the image embedding before projection.
            T = torch.tensor([math.radians(x), math.sin(math.radians(y)),
                              math.cos(math.radians(y)), z])
            T = T[None, None, :].repeat(n_samples, 1, 1).to(c.device)
            c = torch.cat([c, T], dim=-1)
            c = model.cc_projection(c)
            cond = {}
            cond['c_crossattn'] = [c]
            # Encode the conditioning image once and reuse the latent (the encoder was
            # previously run twice on the same input and the first result discarded).
            c_concat = model.encode_first_stage(input_im.to(c.device)).mode().detach()
            cond['c_concat'] = [c_concat.repeat(n_samples, 1, 1, 1)]
            if scale != 1.0:
                # All-zero conditioning for the unconditional guidance branch.
                uc = {}
                uc['c_concat'] = [torch.zeros(n_samples, 4, h // 8, w // 8).to(c.device)]
                uc['c_crossattn'] = [torch.zeros_like(c).to(c.device)]
            else:
                uc = None

            shape = [4, h // 8, w // 8]
            samples_ddim, _ = sampler.sample(S=ddim_steps,
                                             conditioning=cond,
                                             batch_size=n_samples,
                                             shape=shape,
                                             verbose=False,
                                             unconditional_guidance_scale=scale,
                                             unconditional_conditioning=uc,
                                             eta=ddim_eta,
                                             x_T=None)
            print(samples_ddim.shape)
            x_samples_ddim = model.decode_first_stage(samples_ddim)
            return torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0).cpu()
82
+
83
+
84
def main(
        model,
        device,
        input_im,
        preprocess=True,
        x=0.,
        y=0.,
        z=0.,
        scale=3.0,
        n_samples=4,
        ddim_steps=50,
        ddim_eta=1.0,
        precision='fp32',
        h=256,
        w=256,
):
    '''
    Gradio callback: turn one uploaded image into n_samples views from a new camera pose.

    :param model: diffusion model handed to DDIMSampler and sample_model.
    :param device: torch device the input tensor is moved to.
    :param input_im (PIL Image): uploaded image.
    :param preprocess (bool): if True, run load_and_preprocess on the upload; otherwise the
        image is resized to 256 x 256 and alpha-composited onto a white background.
    :param x, y, z (float): relative camera pose forwarded to sample_model.
    :param scale, n_samples, ddim_steps, ddim_eta, precision, h, w: forwarded to sample_model.
    :return (output_ims, show_in_im) if _SHOW_INTERMEDIATE is set, else output_ims, where
        output_ims is a list of PIL images and show_in_im the image actually fed to the model.
    '''
    print('old input_im:', input_im.size)

    if not preprocess:
        rgba = input_im.resize([256, 256], Image.Resampling.LANCZOS)
        rgba = np.asarray(rgba, dtype=np.float32) / 255.0
        # (H, W, 4) array in [0, 1].

        # Composite onto white using the alpha channel, so that the foreground blends smoothly
        # into the background instead of being hard-thresholded.
        alpha = rgba[:, :, 3:4]
        white = np.ones_like(rgba)
        blended = alpha * rgba + (1.0 - alpha) * white

        input_im = blended[:, :, 0:3]
        # (H, W, 3) array in [0, 1].
    else:
        input_im = load_and_preprocess(input_im)
        input_im = (input_im / 255.0).astype(np.float32)
        # (H, W, 3) array in [0, 1].

    print('new input_im:', input_im.shape, input_im.dtype, input_im.min(), input_im.max())
    show_in_im = Image.fromarray((input_im * 255).astype(np.uint8))

    # To a (1, 3, h, w) tensor in [-1, 1].
    im_tensor = transforms.ToTensor()(input_im).unsqueeze(0).to(device)
    im_tensor = im_tensor * 2 - 1
    im_tensor = transforms.functional.resize(im_tensor, [h, w])

    sampler = DDIMSampler(model)
    x_samples_ddim = sample_model(im_tensor, model, sampler, precision, h, w,
                                  ddim_steps, n_samples, scale, ddim_eta, x, y, z)

    output_ims = [
        Image.fromarray(
            (255. * rearrange(sample.cpu().numpy(), 'c h w -> h w c')).astype(np.uint8))
        for sample in x_samples_ddim]

    return (output_ims, show_in_im) if _SHOW_INTERMEDIATE else output_ims
145
+
146
+
147
+ description = '''
148
+ Generate novel viewpoints of an object depicted in one input image using a fine-tuned version of Stable Diffusion.
149
+ '''
150
+
151
+ article = '''
152
+ ## How to use this?
153
+ TBD
154
+ ## How does this work?
155
+ TBD
156
+ '''
157
+
158
+
159
def run_demo(
        device_idx=_GPU_INDEX,
        ckpt='last.ckpt',
        config='configs/sd-objaverse-finetune-c_concat-256.yaml',
):
    '''
    Load the model and launch the Gradio interface around main().

    :param device_idx (int): index of the CUDA device to run on.
    :param ckpt (str): checkpoint path passed to load_model_from_config.
    :param config (str): path of the OmegaConf config file describing the model.
    '''
    device = f'cuda:{device_idx}'
    config = OmegaConf.load(config)
    model = load_model_from_config(config, ckpt, device=device)

    # Input widgets, in the positional order expected by main() after (model, device).
    image_in = gr.Image(type='pil', image_mode='RGBA')  # shape=[512, 512]
    preprocess_in = gr.Checkbox(
        True, label='Preprocess image (remove background and center)',
        info='If enabled, the uploaded image will be preprocessed to remove the background and center the object by cropping and/or padding as necessary. '
        'If disabled, the image will be used as-is, *BUT* a fully transparent or white background is required.')
    polar_in = gr.Slider(
        -90, 90, value=0, step=5, label='Polar angle (vertical rotation in degrees)',
        info='Positive values move the camera down, while negative values move the camera up.')
    azimuth_in = gr.Slider(
        -90, 90, value=0, step=5, label='Azimuth angle (horizontal rotation in degrees)',
        info='Positive values move the camera right, while negative values move the camera left.')
    radius_in = gr.Slider(
        -2, 2, value=0, step=0.5, label='Radius (distance from center)',
        info='Positive values move the camera further away, while negative values move the camera closer.')
    scale_in = gr.Slider(0, 30, value=3, step=1, label='cfg scale')
    samples_in = gr.Slider(1, 8, value=4, step=1, label='Number of samples to generate')
    steps_in = gr.Slider(5, 200, value=100, step=5, label='Number of steps')
    inputs = [image_in, preprocess_in, polar_in, azimuth_in, radius_in,
              scale_in, samples_in, steps_in]

    gallery_out = gr.Gallery(label='Generated images from specified new viewpoint')
    gallery_out.style(grid=2)
    output = [gallery_out]

    if _SHOW_INTERMEDIATE:
        # main() returns the preprocessed input as a second value in this mode.
        output.append(gr.Image(type='pil', image_mode='RGB', label='Preprocessed input image'))

    # Bind the heavyweight arguments once; the explicit __name__ is set because partial
    # objects do not carry one.
    fn_with_model = partial(main, model, device)
    fn_with_model.__name__ = 'fn_with_model'

    # No bundled examples.
    examples = []

    demo = gr.Interface(
        fn=fn_with_model,
        title='Demo for Zero-Shot Control of Camera Viewpoints within a Single Image',
        description=description,
        article=article,
        inputs=inputs,
        outputs=output,
        examples=examples,
        allow_flagging='never',
    )
    demo.launch(enable_queue=True, share=True)
216
+
217
+
218
+ if __name__ == '__main__':
219
+ fire.Fire(run_demo)
configs/sd-objaverse-finetune-c_concat-256.yaml ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-04
3
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.0120
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: "image_target"
11
+ cond_stage_key: "image_cond"
12
+ image_size: 32
13
+ channels: 4
14
+ cond_stage_trainable: false # Note: different from the one we trained before
15
+ conditioning_key: hybrid
16
+ monitor: val/loss_simple_ema
17
+ scale_factor: 0.18215
18
+
19
+ scheduler_config: # linear warm-up; its length is warm_up_steps below (currently 100 steps, not 10000)
20
+ target: ldm.lr_scheduler.LambdaLinearScheduler
21
+ params:
22
+ warm_up_steps: [ 100 ]
23
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
24
+ f_start: [ 1.e-6 ]
25
+ f_max: [ 1. ]
26
+ f_min: [ 1. ]
27
+
28
+ unet_config:
29
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
30
+ params:
31
+ image_size: 32 # unused
32
+ in_channels: 8
33
+ out_channels: 4
34
+ model_channels: 320
35
+ attention_resolutions: [ 4, 2, 1 ]
36
+ num_res_blocks: 2
37
+ channel_mult: [ 1, 2, 4, 4 ]
38
+ num_heads: 8
39
+ use_spatial_transformer: True
40
+ transformer_depth: 1
41
+ context_dim: 768
42
+ use_checkpoint: True
43
+ legacy: False
44
+
45
+ first_stage_config:
46
+ target: ldm.models.autoencoder.AutoencoderKL
47
+ params:
48
+ embed_dim: 4
49
+ monitor: val/rec_loss
50
+ ddconfig:
51
+ double_z: true
52
+ z_channels: 4
53
+ resolution: 256
54
+ in_channels: 3
55
+ out_ch: 3
56
+ ch: 128
57
+ ch_mult:
58
+ - 1
59
+ - 2
60
+ - 4
61
+ - 4
62
+ num_res_blocks: 2
63
+ attn_resolutions: []
64
+ dropout: 0.0
65
+ lossconfig:
66
+ target: torch.nn.Identity
67
+
68
+ cond_stage_config:
69
+ target: ldm.modules.encoders.modules.FrozenCLIPImageEmbedder
70
+
71
+
72
+ data:
73
+ target: ldm.data.simple.ObjaverseDataModuleFromConfig
74
+ params:
75
+ root_dir: 'views_whole_sphere'
76
+ batch_size: 192
77
+ num_workers: 16
78
+ total_view: 4
79
+ train:
80
+ validation: False
81
+ image_transforms:
82
+ size: 256
83
+
84
+ validation:
85
+ validation: True
86
+ image_transforms:
87
+ size: 256
88
+
89
+
90
+ lightning:
91
+ find_unused_parameters: false
92
+ metrics_over_trainsteps_checkpoint: True
93
+ modelcheckpoint:
94
+ params:
95
+ every_n_train_steps: 5000
96
+ callbacks:
97
+ image_logger:
98
+ target: main.ImageLogger
99
+ params:
100
+ batch_frequency: 500
101
+ max_images: 32
102
+ increase_log_steps: False
103
+ log_first_step: True
104
+ log_images_kwargs:
105
+ use_ema_scope: False
106
+ inpaint: False
107
+ plot_progressive_rows: False
108
+ plot_diffusion_rows: False
109
+ N: 32
110
+ unconditional_guidance_scale: 3.0
111
+ unconditional_guidance_label: [""]
112
+
113
+ trainer:
114
+ benchmark: True
115
+ val_check_interval: 5000000 # really sorry
116
+ num_sanity_val_steps: 0
117
+ accumulate_grad_batches: 1
gradio_new.py ADDED
@@ -0,0 +1,663 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ conda activate zero123
3
+ cd stable-diffusion
4
+ python gradio_new.py 0
5
+ '''
6
+
7
+ import diffusers # 0.12.1
8
+ import math
9
+ import fire
10
+ import gradio as gr
11
+ import lovely_numpy
12
+ import lovely_tensors
13
+ import numpy as np
14
+ import plotly.express as px
15
+ import plotly.graph_objects as go
16
+ import rich
17
+ import sys
18
+ import time
19
+ import torch
20
+ from contextlib import nullcontext
21
+ from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
22
+ from einops import rearrange
23
+ from functools import partial
24
+ from ldm.models.diffusion.ddim import DDIMSampler
25
+ from ldm.util import create_carvekit_interface, load_and_preprocess, instantiate_from_config
26
+ from lovely_numpy import lo
27
+ from omegaconf import OmegaConf
28
+ from PIL import Image
29
+ from rich import print
30
+ from transformers import AutoFeatureExtractor #, CLIPImageProcessor
31
+ from torch import autocast
32
+ from torchvision import transforms
33
+
34
+
35
+ _SHOW_DESC = True
36
+ _SHOW_INTERMEDIATE = False
37
+ # _SHOW_INTERMEDIATE = True
38
+ _GPU_INDEX = 0
39
+ # _GPU_INDEX = 2
40
+
41
+ # _TITLE = 'Zero-Shot Control of Camera Viewpoints within a Single Image'
42
+ _TITLE = 'Zero-1-to-3: Zero-shot One Image to 3D Object'
43
+
44
+ # This demo allows you to generate novel viewpoints of an object depicted in an input image using a fine-tuned version of Stable Diffusion.
45
+ _DESCRIPTION = '''
46
+ This demo allows you to control camera rotation and thereby generate novel viewpoints of an object within a single image.
47
+ It is based on Stable Diffusion. Check out our [project webpage](https://zero123.cs.columbia.edu/) and [paper](https://arxiv.org/) if you want to learn more about the method!
48
+ Note that this model is not intended for images of humans or faces, and is unlikely to work well for them.
49
+ '''
50
+
51
+ _ARTICLE = 'See uses.md'
52
+
53
+
54
def load_model_from_config(config, ckpt, device, verbose=False):
    '''
    Instantiate the model described by config.model and load checkpoint weights into it.

    :param config: config object exposing a `model` entry for instantiate_from_config.
    :param ckpt (str): path of the checkpoint file; it must contain a 'state_dict' entry.
    :param device: torch device (or device string) that the finished model is moved to.
    :param verbose (bool): if True, print the missing / unexpected state-dict keys.
    :return the model, moved to device and switched to eval mode.
    '''
    print(f'Loading model from {ckpt}')
    # Deserialize onto the CPU. Mapping the checkpoint straight to the target device keeps a
    # second copy of every tensor on the GPU while the model itself is being moved there,
    # which roughly doubles peak GPU memory at start-up.
    # NOTE(review): torch.load unpickles arbitrary objects -- only load trusted checkpoints.
    pl_sd = torch.load(ckpt, map_location='cpu')
    if 'global_step' in pl_sd:
        print(f'Global Step: {pl_sd["global_step"]}')
    sd = pl_sd['state_dict']
    model = instantiate_from_config(config.model)
    # strict=False tolerates key mismatches; they are only reported when verbose is set.
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print('missing keys:')
        print(m)
    if len(u) > 0 and verbose:
        print('unexpected keys:')
        print(u)

    model.to(device)
    model.eval()
    return model
72
+
73
+
74
@torch.no_grad()
def sample_model(input_im, model, sampler, precision, h, w, ddim_steps, n_samples, scale,
                 ddim_eta, x, y, z):
    '''
    Draw n_samples novel-view images for one conditioning image and a relative camera pose.

    :param input_im: image tensor passed to both model.get_learned_conditioning and
        model.encode_first_stage (main_run() in this file passes a (1, 3, h, w) tensor
        in [-1, 1]).
    :param model: diffusion model providing ema_scope, get_learned_conditioning,
        cc_projection, encode_first_stage and decode_first_stage.
    :param sampler: sampler object whose sample(...) method returns (latents, extra).
    :param precision (str): 'autocast' enables torch autocast; any other value disables it.
    :param h, w (int): image size; latents are sampled at (4, h // 8, w // 8).
    :param ddim_steps (int): number of sampler steps.
    :param n_samples (int): number of images to draw.
    :param scale (float): unconditional guidance scale; 1.0 skips the unconditional branch.
    :param ddim_eta (float): eta forwarded to the sampler.
    :param x, y (float): angles in degrees; x enters the pose vector in radians, y as (sin, cos).
    :param z (float): third pose component, appended unchanged.
    :return decoded samples rescaled from [-1, 1] to [0, 1], clamped, on the CPU.
    '''
    precision_scope = autocast if precision == 'autocast' else nullcontext
    with precision_scope('cuda'):
        with model.ema_scope():
            c = model.get_learned_conditioning(input_im).tile(n_samples, 1, 1)
            # Pose vector appended to the image embedding before projection.
            T = torch.tensor([math.radians(x), math.sin(math.radians(y)),
                              math.cos(math.radians(y)), z])
            T = T[None, None, :].repeat(n_samples, 1, 1).to(c.device)
            c = torch.cat([c, T], dim=-1)
            c = model.cc_projection(c)
            cond = {}
            cond['c_crossattn'] = [c]
            # Encode the conditioning image once and reuse the latent (the encoder was
            # previously run twice on the same input and the first result discarded).
            c_concat = model.encode_first_stage(input_im.to(c.device)).mode().detach()
            cond['c_concat'] = [c_concat.repeat(n_samples, 1, 1, 1)]
            if scale != 1.0:
                # All-zero conditioning for the unconditional guidance branch.
                uc = {}
                uc['c_concat'] = [torch.zeros(n_samples, 4, h // 8, w // 8).to(c.device)]
                uc['c_crossattn'] = [torch.zeros_like(c).to(c.device)]
            else:
                uc = None

            shape = [4, h // 8, w // 8]
            samples_ddim, _ = sampler.sample(S=ddim_steps,
                                             conditioning=cond,
                                             batch_size=n_samples,
                                             shape=shape,
                                             verbose=False,
                                             unconditional_guidance_scale=scale,
                                             unconditional_conditioning=uc,
                                             eta=ddim_eta,
                                             x_T=None)
            print(samples_ddim.shape)
            x_samples_ddim = model.decode_first_stage(samples_ddim)
            return torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0).cpu()
112
+
113
+
114
class CameraVisualizer:
    '''
    Keeps the target camera pose and the input image, and renders both as a 3D plotly figure
    (the image as a textured plane, the input / target cameras as wireframe cones).
    '''

    def __init__(self, gradio_plot):
        '''
        :param gradio_plot: plot component handle; this class only stores it.
        '''
        self._gradio_plot = gradio_plot
        self._fig = None
        # Target camera pose relative to the input view.
        self._polar = 0.0
        self._azimuth = 0.0
        self._radius = 0.0
        # Image state, filled in by encode_image().
        self._raw_image = None
        self._8bit_image = None
        self._image_colorscale = None

    def polar_change(self, value):
        '''Store a new polar angle; the figure is not redrawn here.'''
        self._polar = value

    def azimuth_change(self, value):
        '''Store a new azimuth angle; the figure is not redrawn here.'''
        self._azimuth = value

    def radius_change(self, value):
        '''Store a new radius offset; the figure is not redrawn here.'''
        self._radius = value

    def encode_image(self, raw_image):
        '''
        Store the image and its palettized version for use as a surface texture.

        :param raw_image (H, W, 3) array of uint8 in [0, 255].
        '''
        # https://stackoverflow.com/questions/60685749/python-plotly-how-to-add-an-image-to-a-3d-scatter-plot

        # Palettize a dummy image only to read back the colors of the 'WEB' palette.
        palette_probe = Image.fromarray(
            np.ones((3, 3, 3), dtype='uint8')).convert('P', palette='WEB')
        idx_to_color = np.array(palette_probe.getpalette()).reshape((-1, 3))

        self._raw_image = raw_image
        self._8bit_image = Image.fromarray(raw_image).convert('P', palette='WEB', dither=None)

        # Colorscale entry i maps palette index i (normalized by 255) to its RGB color.
        colorscale = []
        for (i, rgb) in enumerate(idx_to_color):
            colorscale.append([i / 255.0, 'rgb({}, {}, {})'.format(*rgb)])
        self._image_colorscale = colorscale

    def update_figure(self):
        '''
        Rebuild the figure from the stored image and camera pose.

        NOTE(review): the nesting of this method was reconstructed from a listing that had lost
        its indentation; all drawing is assumed to sit under the "image is set" check -- confirm.

        :return plotly Figure, which is also cached in self._fig.
        '''
        fig = go.Figure()

        if self._raw_image is not None:
            (H, W, _) = self._raw_image.shape

            # The image is drawn as a flat surface in the x = 0 plane.
            x = np.zeros((H, W))
            (y, z) = np.meshgrid(np.linspace(-1.0, 1.0, W), np.linspace(1.0, -1.0, H) * H / W)
            for (tag, grid) in (('x:', x), ('y:', y), ('z:', z)):
                print(tag, lo(grid))

            image_plane = go.Surface(
                x=x, y=y, z=z,
                surfacecolor=self._8bit_image,
                cmin=0,
                cmax=255,
                colorscale=self._image_colorscale,
                showscale=False,
                lighting_diffuse=1.0,
                lighting_ambient=1.0,
                lighting_fresnel=1.0,
                lighting_roughness=1.0,
                lighting_specular=0.3)
            fig.add_trace(image_plane)

            scene_bounds = 3.5
            base_radius = 2.5
            zoom_scale = 1.5  # Note that input radius offset is in [-0.5, 0.5].
            fov_deg = 50.0
            # Apex-to-corner edges followed by the rectangle joining the four corners.
            edges = [(0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (2, 3), (3, 4), (4, 1)]

            input_cone = calc_cam_cone_pts_3d(
                0.0, 0.0, base_radius, fov_deg)  # (5, 3).
            output_cone = calc_cam_cone_pts_3d(
                self._polar, self._azimuth, base_radius + self._radius * zoom_scale,
                fov_deg)  # (5, 3).

            cones = [(input_cone, 'green', 'Input view'),
                     (output_cone, 'blue', 'Target view')]
            for (cone, clr, legend) in cones:

                for (i, (start, end)) in enumerate(edges):
                    (x1, y1, z1) = (cone[start, 0], cone[start, 1], cone[start, 2])
                    (x2, y2, z2) = (cone[end, 0], cone[end, 1], cone[end, 2])
                    # Only the first edge of each cone gets a legend entry.
                    fig.add_trace(go.Scatter3d(
                        x=[x1, x2], y=[y1, y2], z=[z1, z2], mode='lines',
                        line=dict(color=clr, width=3),
                        name=legend, showlegend=(i == 0)))

                # Add label: below the apex for low cameras, above it otherwise.
                if cone[0, 2] <= base_radius / 2.0:
                    (label_dz, label_pos) = (-0.05, 'bottom center')
                else:
                    (label_dz, label_pos) = (0.05, 'top center')
                fig.add_trace(go.Scatter3d(
                    x=[cone[0, 0]], y=[cone[0, 1]], z=[cone[0, 2] + label_dz],
                    showlegend=False,
                    mode='text', text=legend, textposition=label_pos))

            def bare_axis():
                # Same settings for all three axes: fixed range, grid and background only.
                return dict(
                    range=[-scene_bounds, scene_bounds],
                    showticklabels=False,
                    showgrid=True,
                    zeroline=False,
                    showbackground=True,
                    showspikes=False,
                    showline=False,
                    ticks='')

            # look at center of scene
            fig.update_layout(
                height=360,
                autosize=True,
                hovermode=False,
                margin=go.layout.Margin(l=0, r=0, b=0, t=0),
                showlegend=True,
                legend=dict(
                    yanchor='bottom',
                    y=0.01,
                    xanchor='right',
                    x=0.99,
                ),
                scene=dict(
                    aspectmode='manual',
                    aspectratio=dict(x=1, y=1, z=1.0),
                    camera=dict(
                        eye=dict(x=base_radius - 1.6, y=0.0, z=0.6),
                        center=dict(x=0.0, y=0.0, z=0.0),
                        up=dict(x=0.0, y=0.0, z=1.0)),
                    xaxis_title='',
                    yaxis_title='',
                    zaxis_title='',
                    xaxis=bare_axis(),
                    yaxis=bare_axis(),
                    zaxis=bare_axis()))

        self._fig = fig
        return fig
275
+
276
+
277
def preprocess_image(models, input_im, preprocess):
    '''
    Convert the uploaded image into the RGB array that is fed to the model.

    :param models (dict): model registry; models['carvekit'] is used when preprocess is set.
    :param input_im (PIL Image).
    :param preprocess (bool): if True, delegate to load_and_preprocess; otherwise resize to
        256 x 256 and alpha-composite the image onto a white background.
    :return input_im (H, W, 3) array in [0, 1].
    '''

    print('old input_im:', input_im.size)
    start_time = time.time()

    if not preprocess:
        rgba = input_im.resize([256, 256], Image.Resampling.LANCZOS)
        rgba = np.asarray(rgba, dtype=np.float32) / 255.0
        # (H, W, 4) array in [0, 1].

        # Composite onto white using the alpha channel, so that the foreground blends smoothly
        # into the background instead of being hard-thresholded.
        alpha = rgba[:, :, 3:4]
        white = np.ones_like(rgba)
        blended = alpha * rgba + (1.0 - alpha) * white

        input_im = blended[:, :, 0:3]
        # (H, W, 3) array in [0, 1].
    else:
        input_im = load_and_preprocess(models['carvekit'], input_im)
        input_im = (input_im / 255.0).astype(np.float32)
        # (H, W, 3) array in [0, 1].

    print(f'Infer foreground mask (preprocess_image) took {time.time() - start_time:.3f}s.')
    print('new input_im:', lo(input_im))

    return input_im
312
+
313
+
314
def main_run(models, device, cam_vis, return_what,
             x=0.0, y=0.0, z=0.0,
             raw_im=None, preprocess=True,
             scale=3.0, n_samples=4, ddim_steps=50, ddim_eta=1.0,
             precision='fp32', h=256, w=256):
    '''
    Shared Gradio callback: safety-check the upload, update the camera visualization and,
    if requested, generate the novel views.

    :param models (dict): uses the 'clip_fe', 'nsfw' and 'turncam' entries here.
    :param device: torch device for the safety-checker input and the image tensor.
    :param cam_vis (CameraVisualizer): receives the pose and the preprocessed image.
    :param return_what: container / string checked for the flags 'angles', 'rand', 'vis'
        and 'gen', which select what is computed and returned.
    :param x, y, z (float): relative camera pose; replaced by random values with 'rand'.
    :param raw_im (PIL Image).
    :param preprocess (bool): forwarded to preprocess_image.
    :param scale, n_samples, ddim_steps, ddim_eta, precision, h, w: forwarded to sample_model.
    :return with 'vis': ([x, y, z,] description, figure, input image);
        with 'gen': ([x, y, z,] description, figure, input image, output images);
        the leading pose values are present only with 'angles'.
        If the safety checker fires: a list of 10 entries holding only the warning text
        (and zeroed angles with 'angles'). None if neither 'vis' nor 'gen' is requested.
    '''

    safety_checker_input = models['clip_fe'](raw_im, return_tensors='pt').to(device)
    # The images argument is a placeholder; only the NSFW flags are used below.
    (_, has_nsfw_concept) = models['nsfw'](
        images=np.ones((1, 3)), clip_input=safety_checker_input.pixel_values)
    print('has_nsfw_concept:', has_nsfw_concept)
    if np.any(has_nsfw_concept):
        print('NSFW content detected.')
        to_return = [None] * 10
        description = ('### <span style="color:red"> Unfortunately, '
                       'potential NSFW content was detected, '
                       'which is not supported by our model. '
                       'Please try again with a different image. </span>')
        if 'angles' in return_what:
            to_return[0:4] = [0.0, 0.0, 0.0, description]
        else:
            to_return[0] = description
        return to_return

    print('Safety check passed.')

    input_im = preprocess_image(models, raw_im, preprocess)

    show_in_im1 = (input_im * 255.0).astype(np.uint8)
    show_in_im2 = Image.fromarray(show_in_im1)

    if 'rand' in return_what:
        # Random pose: polar drawn via arcsin of a uniform value, azimuth uniform, no zoom.
        x = int(np.round(np.arcsin(np.random.uniform(-1.0, 1.0)) * 160.0 / np.pi))  # [-80, 80].
        y = int(np.round(np.random.uniform(-150.0, 150.0)))
        z = 0.0

    cam_vis.polar_change(x)
    cam_vis.azimuth_change(y)
    cam_vis.radius_change(z)
    cam_vis.encode_image(show_in_im1)
    new_fig = cam_vis.update_figure()

    if 'vis' in return_what:
        description = ('The viewpoints are visualized on the top right. '
                       'Click Run Generation to update the results on the bottom right.')

        results = (description, new_fig, show_in_im2)
        if 'angles' in return_what:
            return (x, y, z) + results
        return results

    if 'gen' in return_what:
        # To a (1, 3, h, w) tensor in [-1, 1].
        im_tensor = transforms.ToTensor()(input_im).unsqueeze(0).to(device)
        im_tensor = im_tensor * 2 - 1
        im_tensor = transforms.functional.resize(im_tensor, [h, w])

        sampler = DDIMSampler(models['turncam'])
        used_x = x  # NOTE: Set this way for consistency.
        x_samples_ddim = sample_model(im_tensor, models['turncam'], sampler, precision, h, w,
                                      ddim_steps, n_samples, scale, ddim_eta, used_x, y, z)

        output_ims = [
            Image.fromarray(
                (255.0 * rearrange(sample.cpu().numpy(), 'c h w -> h w c')).astype(np.uint8))
            for sample in x_samples_ddim]

        description = None

        results = (description, new_fig, show_in_im2, output_ims)
        if 'angles' in return_what:
            return (x, y, z) + results
        return results

    # Neither 'vis' nor 'gen' requested: nothing to return.
    return None
401
+
402
+
403
def calc_cam_cone_pts_3d(polar_deg, azimuth_deg, radius_m, fov_deg):
    '''
    Compute the five points of a camera wireframe cone: the camera center plus the four
    frustum corners, for a camera on a sphere around the origin that looks at the origin.

    :param polar_deg (float): polar angle in degrees (its sign is flipped internally).
    :param azimuth_deg (float): azimuth angle in degrees.
    :param radius_m (float): distance of the camera center from the origin.
    :param fov_deg (float): field of view in degrees, used for both frustum axes.
    :return (5, 3) array of float with (x, y, z); row 0 is the camera center, rows 1-4 are
        the corners, each at unit distance from the camera center.
    '''
    polar_rad = -np.deg2rad(polar_deg)  # NOTE: Inverse of how used_x relates to x.
    azimuth_rad = np.deg2rad(azimuth_deg)
    half_extent = np.tan(np.deg2rad(fov_deg) / 2.0)

    (sin_a, cos_a) = (np.sin(azimuth_rad), np.cos(azimuth_rad))
    (sin_p, cos_p) = (np.sin(polar_rad), np.cos(polar_rad))

    # Camera pose center:
    cam_pos = radius_m * np.array([cos_a * cos_p, sin_a * cos_p, sin_p])

    # Camera extrinsics (rotation matrix only), assuming the camera is looking at the origin:
    camera_R = np.array([[cos_a * cos_p, -sin_a, -cos_a * sin_p],
                         [sin_a * cos_p, cos_a, -sin_a * sin_p],
                         [sin_p, 0.0, cos_p]])

    # Frustum corners in camera space; the camera looks along -x in its own frame.
    corners_cam = np.array([[-1.0, half_extent, half_extent],
                            [-1.0, -half_extent, half_extent],
                            [-1.0, -half_extent, -half_extent],
                            [-1.0, half_extent, -half_extent]])

    # Rotate all corners into world space at once (row i equals camera_R @ corners_cam[i]),
    # then normalize each direction to unit length.
    corner_dirs = corners_cam @ camera_R.T
    corner_dirs = corner_dirs / np.linalg.norm(corner_dirs, ord=2, axis=1, keepdims=True)

    # Attach the unit directions as offsets to the actual 3D camera position.
    return np.vstack([cam_pos[None, :], cam_pos[None, :] + corner_dirs])
467
+
468
+
469
def run_demo(
        device_idx=_GPU_INDEX,
        ckpt='105000.ckpt',
        config='configs/sd-objaverse-finetune-c_concat-256.yaml'):
    '''
    Load all models, compose the Gradio Blocks UI and launch the demo.

    :param device_idx (int): CUDA device index. If the first command-line argument
        parses as an integer, that value replaces this one (legacy behavior).
    :param ckpt (str): Checkpoint path handed to load_model_from_config.
    :param config (str): Path of the YAML config loaded with OmegaConf.
    '''
    print('sys.argv:', sys.argv)
    if len(sys.argv) > 1:
        print('old device_idx:', device_idx)
        try:
            device_idx = int(sys.argv[1])
        except ValueError:
            # The first argument is not a bare integer (e.g. a Fire flag such as
            # --ckpt=...), so keep the value Fire already parsed instead of crashing.
            pass
        print('new device_idx:', device_idx)

    device = f'cuda:{device_idx}'
    config = OmegaConf.load(config)

    # Instantiate all models beforehand for efficiency.
    models = dict()
    print('Instantiating LatentDiffusion...')
    models['turncam'] = load_model_from_config(config, ckpt, device=device)
    print('Instantiating Carvekit HiInterface...')
    models['carvekit'] = create_carvekit_interface()
    print('Instantiating StableDiffusionSafetyChecker...')
    models['nsfw'] = StableDiffusionSafetyChecker.from_pretrained(
        'CompVis/stable-diffusion-safety-checker').to(device)
    print('Instantiating AutoFeatureExtractor...')
    models['clip_fe'] = AutoFeatureExtractor.from_pretrained(
        'CompVis/stable-diffusion-safety-checker')

    # Reduce NSFW false positives.
    # NOTE: At the time of writing, and for diffusers 0.12.1, the default parameters are:
    # models['nsfw'].concept_embeds_weights:
    # [0.1800, 0.1900, 0.2060, 0.2100, 0.1950, 0.1900, 0.1940, 0.1900, 0.1900, 0.2200, 0.1900,
    #  0.1900, 0.1950, 0.1984, 0.2100, 0.2140, 0.2000].
    # models['nsfw'].special_care_embeds_weights:
    # [0.1950, 0.2000, 0.2200].
    # We multiply all by some factor > 1 to make them less likely to be triggered.
    models['nsfw'].concept_embeds_weights *= 1.07
    models['nsfw'].special_care_embeds_weights *= 1.07

    # Explicit encoding so the article renders the same regardless of the host locale.
    with open('instructions.md', 'r', encoding='utf-8') as f:
        article = f.read()

    # Compose demo layout & data flow.
    demo = gr.Blocks(title=_TITLE)

    with demo:
        gr.Markdown('# ' + _TITLE)
        gr.Markdown(_DESCRIPTION)

        with gr.Row():
            with gr.Column(scale=0.9, variant='panel'):

                image_block = gr.Image(type='pil', image_mode='RGBA',
                                       label='Input image of single object')
                preprocess_chk = gr.Checkbox(
                    True, label='Preprocess image automatically (remove background and recenter object)')

                gr.Markdown('*Try camera position presets:*')
                with gr.Row():
                    left_btn = gr.Button('View from the Left', variant='primary')
                    above_btn = gr.Button('View from Above', variant='primary')
                    right_btn = gr.Button('View from the Right', variant='primary')
                with gr.Row():
                    random_btn = gr.Button('Random Rotation', variant='primary')
                    below_btn = gr.Button('View from Below', variant='primary')
                    behind_btn = gr.Button('View from Behind', variant='primary')

                gr.Markdown('*Control camera position manually:*')
                # Positive values move the camera down, negative values move it up.
                polar_slider = gr.Slider(
                    -90, 90, value=0, step=5, label='Polar angle (vertical rotation in degrees)')
                # Positive values move the camera right, negative values move it left.
                azimuth_slider = gr.Slider(
                    -180, 180, value=0, step=5, label='Azimuth angle (horizontal rotation in degrees)')
                # Positive values move the camera further away, negative values move it closer.
                radius_slider = gr.Slider(
                    -0.5, 0.5, value=0.0, step=0.1, label='Zoom (relative distance from center)')

                samples_slider = gr.Slider(1, 8, value=4, step=1,
                                           label='Number of samples to generate')

                with gr.Accordion('Advanced options', open=False):
                    scale_slider = gr.Slider(0, 30, value=3, step=1,
                                             label='Diffusion guidance scale')
                    steps_slider = gr.Slider(5, 200, value=75, step=5,
                                             label='Number of diffusion inference steps')

                with gr.Row():
                    vis_btn = gr.Button('Visualize Angles', variant='secondary')
                    run_btn = gr.Button('Run Generation', variant='primary')

                desc_output = gr.Markdown('The results will appear on the right.', visible=_SHOW_DESC)

            with gr.Column(scale=1.1, variant='panel'):

                vis_output = gr.Plot(
                    label='Relationship between input (green) and output (blue) camera poses')

                gen_output = gr.Gallery(label='Generated images from specified new viewpoint')
                gen_output.style(grid=2)

                preproc_output = gr.Image(type='pil', image_mode='RGB',
                                          label='Preprocessed input image', visible=_SHOW_INTERMEDIATE)

        gr.Markdown(article)

        # NOTE: I am forced to update vis_output for these preset buttons,
        # because otherwise the gradio plot always resets the plotly 3D viewpoint for some reason,
        # which might confuse the user into thinking that the plot has been updated too.

        cam_vis = CameraVisualizer(vis_output)

        vis_btn.click(fn=partial(main_run, models, device, cam_vis, 'vis'),
                      inputs=[polar_slider, azimuth_slider, radius_slider,
                              image_block, preprocess_chk],
                      outputs=[desc_output, vis_output, preproc_output])

        run_btn.click(fn=partial(main_run, models, device, cam_vis, 'gen'),
                      inputs=[polar_slider, azimuth_slider, radius_slider,
                              image_block, preprocess_chk,
                              scale_slider, samples_slider, steps_slider],
                      outputs=[desc_output, vis_output, preproc_output, gen_output])

        # Preset buttons: each one runs generation with fixed (polar, azimuth, radius)
        # values bound up front and writes those angles back to the sliders.
        preset_inputs = [image_block, preprocess_chk,
                         scale_slider, samples_slider, steps_slider]
        preset_outputs = [polar_slider, azimuth_slider, radius_slider,
                          desc_output, vis_output, preproc_output, gen_output]
        presets = [
            (left_btn, 'angles_gen', (0.0, -90.0, 0.0)),
            (above_btn, 'angles_gen', (-90.0, 0.0, 0.0)),
            (right_btn, 'angles_gen', (0.0, 90.0, 0.0)),
            (random_btn, 'rand_angles_gen', (-1.0, -1.0, -1.0)),
            (below_btn, 'angles_gen', (90.0, 0.0, 0.0)),
            (behind_btn, 'angles_gen', (0.0, 180.0, 0.0)),
        ]
        for button, return_what, angles in presets:
            button.click(fn=partial(main_run, models, device, cam_vis, return_what, *angles),
                         inputs=preset_inputs, outputs=preset_outputs)

    demo.launch(enable_queue=True, share=True)


if __name__ == '__main__':

    fire.Fire(run_demo)
gradio_objaverse.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import nullcontext
2
+ from functools import partial
3
+
4
+ import math
5
+ import fire
6
+ import gradio as gr
7
+ import numpy as np
8
+ import torch
9
+ from einops import rearrange
10
+ from ldm.models.diffusion.ddim import DDIMSampler
11
+ from omegaconf import OmegaConf
12
+ from PIL import Image
13
+ from torch import autocast
14
+ from torchvision import transforms
15
+ from ldm.util import load_and_preprocess, instantiate_from_config
16
+
17
def load_model_from_config(config, ckpt, device, verbose=False):
    """Build the model described by ``config.model`` and load weights from ``ckpt``.

    The checkpoint is read with ``torch.load`` onto ``device``; its ``"state_dict"``
    entry is loaded non-strictly, so missing/unexpected keys do not raise and are
    only printed when ``verbose`` is true. The model is moved to ``device`` and
    switched to eval mode before being returned.
    """
    print(f"Loading model from {ckpt}")
    checkpoint = torch.load(ckpt, map_location=device)
    if "global_step" in checkpoint:
        print(f"Global Step: {checkpoint['global_step']}")

    model = instantiate_from_config(config.model)
    missing, unexpected = model.load_state_dict(checkpoint["state_dict"], strict=False)
    if verbose and len(missing) > 0:
        print("missing keys:")
        print(missing)
    if verbose and len(unexpected) > 0:
        print("unexpected keys:")
        print(unexpected)

    model.to(device)
    model.eval()
    return model
35
+
36
@torch.no_grad()
def sample_model(input_im, model, sampler, precision, h, w, ddim_steps, n_samples, scale, \
                 ddim_eta, x, y, z):
    """Sample ``n_samples`` novel views of ``input_im`` with the given sampler.

    The cross-attention conditioning is the model's learned conditioning of
    ``input_im`` concatenated with the 4-vector
    ``[radians(x), sin(radians(y)), cos(radians(y)), z]`` and passed through
    ``model.cc_projection``. The concat conditioning is the first-stage encoding
    (distribution mode) of ``input_im``. When ``scale != 1.0`` an all-zero
    unconditional conditioning is supplied for classifier-free guidance.

    Returns the decoded samples mapped from [-1, 1] to [0, 1], clamped, on CPU.
    """
    precision_scope = autocast if precision=="autocast" else nullcontext
    with precision_scope("cuda"):
        with model.ema_scope():
            c = model.get_learned_conditioning(input_im).tile(n_samples,1,1)
            T = torch.tensor([math.radians(x), math.sin(math.radians(y)), math.cos(math.radians(y)), z])
            T = T[None, None, :].repeat(n_samples, 1, 1).to(c.device)
            c = torch.cat([c, T], dim=-1)
            c = model.cc_projection(c)
            cond = {}
            cond['c_crossattn'] = [c]
            # Encode the conditioning image once and reuse it; the original ran the
            # first-stage encoder twice and discarded the first result.
            c_concat = model.encode_first_stage((input_im.to(c.device))).mode().detach()
            cond['c_concat'] = [c_concat.repeat(n_samples, 1, 1, 1)]
            if scale != 1.0:
                uc = {}
                uc['c_concat'] = [torch.zeros(n_samples, 4, h // 8, w // 8).to(c.device)]
                uc['c_crossattn'] = [torch.zeros_like(c).to(c.device)]
            else:
                uc = None

            # Latent shape: 4 channels at 1/8 of the image resolution.
            shape = [4, h // 8, w // 8]
            samples_ddim, _ = sampler.sample(S=ddim_steps,
                                             conditioning=cond,
                                             batch_size=n_samples,
                                             shape=shape,
                                             verbose=False,
                                             unconditional_guidance_scale=scale,
                                             unconditional_conditioning=uc,
                                             eta=ddim_eta,
                                             x_T=None)
            print(samples_ddim.shape)
            x_samples_ddim = model.decode_first_stage(samples_ddim)
            return torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0).cpu()
73
+
74
+
75
def main(
    model,
    device,
    input_im,
    x=0.,
    y=0.,
    z=0.,
    scale=3.0,
    n_samples=4,
    ddim_steps=50,
    preprocess=True,
    ddim_eta=1.0,
    precision="fp32",
    h=256,
    w=256,
):
    """Generate novel-view images from one input image and return them as PIL images.

    With ``preprocess`` enabled the image goes through ``load_and_preprocess``;
    otherwise it is resized to 256x256, scaled to [0, 1], pixels whose last
    channel is <= 0.9 are set to white, and only the first three channels are
    kept. The result is converted to a tensor in [-1, 1], resized to ``(h, w)``
    and sampled with a DDIM sampler. ``x``, ``y`` and ``z`` are forwarded to
    ``sample_model`` unchanged.
    """
    print(input_im.size)
    if not preprocess:
        resized = input_im.resize([256, 256], Image.Resampling.LANCZOS)
        pixels = np.asarray(resized, dtype=np.float32) / 255.
        # very important, thresholding background
        pixels[pixels[:, :, -1] <= 0.9] = [1., 1., 1., 1.]
        input_im = pixels[:, :, :3]
    else:
        input_im = load_and_preprocess(input_im)
    print(input_im.shape)

    batch = transforms.ToTensor()(input_im).unsqueeze(0).to(device)
    batch = batch * 2 - 1
    batch = transforms.functional.resize(batch, [h, w])

    sampler = DDIMSampler(model)
    samples = sample_model(batch, model, sampler, precision, h, w,
                           ddim_steps, n_samples, scale, ddim_eta, x, y, z)

    def to_pil(sample):
        # (c, h, w) float in [0, 1] -> (h, w, c) uint8 image.
        hwc = 255. * rearrange(sample.cpu().numpy(), 'c h w -> h w c')
        return Image.fromarray(hwc.astype(np.uint8))

    return [to_pil(sample) for sample in samples]
114
+
115
+
116
# NOTE(review): `description` and `article` are not passed to gr.Interface in
# run_demo (both keyword arguments are commented out there), and the text appears
# to describe a different, image-variations demo -- confirm before re-enabling.
description = """Generate variations on an input image using a fine-tuned version of Stable Diffision.
Trained by [Justin Pinkney](https://www.justinpinkney.com) ([@Buntworthy](https://twitter.com/Buntworthy)) at [Lambda](https://lambdalabs.com/)
__Get the [code](https://github.com/justinpinkney/stable-diffusion) and [model](https://huggingface.co/lambdalabs/stable-diffusion-image-conditioned).__
![](https://raw.githubusercontent.com/justinpinkney/stable-diffusion/main/assets/im-vars-thin.jpg)
"""

article = """
## How does this work?
The normal Stable Diffusion model is trained to be conditioned on text input. This version has had the original text encoder (from CLIP) removed, and replaced with
the CLIP _image_ encoder instead. So instead of generating images based a text input, images are generated to match CLIP's embedding of the image.
This creates images which have the same rough style and content, but different details, in particular the composition is generally quite different.
This is a totally different approach to the img2img script of the original Stable Diffusion and gives very different results.
The model was fine tuned on the [LAION aethetics v2 6+ dataset](https://laion.ai/blog/laion-aesthetics/) to accept the new conditioning.
Training was done on 4xA6000 GPUs on [Lambda GPU Cloud](https://lambdalabs.com/service/gpu-cloud).
More details on the method and training will come in a future blog post.
"""
134
+
135
+
136
def run_demo(
    device_idx=0,
    ckpt="last.ckpt",
    config="configs/sd-objaverse-finetune-c_concat-256.yaml",
):
    """Load the model onto ``cuda:<device_idx>`` and serve a Gradio Interface around ``main``.

    The input widgets are listed in the order of ``main``'s parameters after
    ``model`` and ``device``: image, polar, azimuth, z, cfg scale, number of
    images, steps, preprocess flag.
    """
    device = f"cuda:{device_idx}"
    model = load_model_from_config(OmegaConf.load(config), ckpt, device=device)

    image_input = gr.Image(type="pil", image_mode="RGBA")  # shape=[512, 512]
    polar_input = gr.Number(label="polar (between axis z+)")
    azimuth_input = gr.Number(label="azimuth (between axis x+)")
    distance_input = gr.Number(label="z (distance from center)")
    scale_input = gr.Slider(0, 100, value=3, step=1, label="cfg scale")
    count_input = gr.Slider(1, 8, value=4, step=1, label="Number images")
    steps_input = gr.Slider(5, 200, value=100, step=5, label="steps")
    preprocess_input = gr.Checkbox(True, label="image preprocess (background removal and recenter)")

    gallery = gr.Gallery(label="Generated variations")
    gallery.style(grid=2)

    # Bind model and device up front and give the partial a __name__ attribute.
    fn_with_model = partial(main, model, device)
    fn_with_model.__name__ = "fn_with_model"

    # No bundled examples; rows would be [image path, polar, azimuth, z, scale, n, steps].
    examples = []

    demo = gr.Interface(
        fn=fn_with_model,
        title="Stable Diffusion Novel View Synthesis (Image)",
        inputs=[
            image_input,
            polar_input,
            azimuth_input,
            distance_input,
            scale_input,
            count_input,
            steps_input,
            preprocess_input,
        ],
        outputs=gallery,
        examples=examples,
        allow_flagging="never",
    )
    demo.launch(enable_queue=True, share=True)


if __name__ == "__main__":
    fire.Fire(run_demo)
instructions.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Tips if not getting expected results :detective:
2
+ 1. If you are trying out images of humans, especially faces, note that this is unfortunately not an intended use case. We encourage you to try images of everyday objects or even artworks instead.
3
+ 2. If some part of the object is missing, check the interactive angle visualization pane (top right) where you can find a panel of the actual input image to the model after preprocessing steps and see if the segmented image contains the entire object you are trying to visualize.
4
+ 3. The model is probabilistic, therefore, if the number of samples is selected to be bigger than 1 and results look different, that's expected as the model tries to predict a diverse set of possibilities given the input image and the specified camera viewpoint.
5
+ 4. Under "advanced options", you can tune two parameters that are also commonly found in other Stable Diffusion demos:
6
+ - Diffusion Guidance Scale defines how much you want the model to respect the input information (image + angles). Higher scale typically leads to less diversity and higher image distortion.
7
+ - Number of diffusion inference steps controls how many diffusion steps are applied to generate each image. Higher is usually better, with diminishing returns.
8
+
9
+ Have fun!
10
+
11
+ A model card can be found here: uses.md
ldm/data/__init__.py ADDED
File without changes
ldm/data/base.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from abc import abstractmethod
4
+ from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
5
+
6
+
7
class Txt2ImgIterableBaseDataset(IterableDataset):
    '''
    Common interface that lets IterableDatasets for text2img data be chained.

    Stores the record count, the valid ids (also exposed as ``sample_ids``) and
    the size, and reports ``num_records`` as its length. Subclasses are expected
    to provide ``__iter__``.
    '''
    def __init__(self, num_records=0, valid_ids=None, size=256):
        super().__init__()
        self.size = size
        self.num_records = num_records
        # sample_ids starts out as the very same object as valid_ids.
        self.valid_ids = self.sample_ids = valid_ids

        n_examples = self.__len__()
        print(f'{self.__class__.__name__} dataset contains {n_examples} examples.')

    def __len__(self):
        return self.num_records

    @abstractmethod
    def __iter__(self):
        pass
26
+
27
+
28
class PRNGMixin(object):
    """
    Mixin exposing a ``prng`` property backed by a numpy RandomState.

    The generator is (re)created whenever the current process id differs from
    the one recorded at the last creation, so that worker processes do not
    sample in lockstep when used with multiprocessing.
    """
    @property
    def prng(self):
        pid = os.getpid()
        if getattr(self, "_initpid", None) == pid:
            # Same process as last time: reuse the cached generator.
            return self._prng
        self._initpid = pid
        self._prng = np.random.RandomState()
        return self._prng
ldm/data/coco.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import albumentations
4
+ import numpy as np
5
+ from PIL import Image
6
+ from tqdm import tqdm
7
+ from torch.utils.data import Dataset
8
+ from abc import abstractmethod
9
+
10
+
11
class CocoBase(Dataset):
    """needed for (image, caption, segmentation) pairs

    Base dataset over a COCO captions JSON file. Subclasses supply the split
    name (``get_split``) and the dataset year (``year``). Each item is a dict
    with the preprocessed image, one randomly drawn caption, the image path and
    file name, plus the segmentation and its path when segmentation is enabled.
    """
    def __init__(self, size=None, dataroot="", datajson="", onehot_segmentation=False, use_stuffthing=False,
                 crop_size=None, force_no_crop=False, given_files=None, use_segmentation=True,crop_type=None):
        """
        :param size: target size of the smaller image side (or of both sides with force_no_crop).
        :param dataroot: directory that contains the image files.
        :param datajson: path to captions_train<year>.json / captions_val<year>.json.
        :param onehot_segmentation: return the segmentation one-hot encoded instead of as RGB.
        :param use_stuffthing: use the stuff+thing segmentation maps.
        :param crop_size: crop edge length; defaults to ``size``.
        :param force_no_crop: resize to (size, size) without cropping.
        :param given_files: optional collection of png file names to restrict the dataset to.
        :param use_segmentation: whether segmentation maps are loaded at all.
        :param crop_type: None, 'random' or 'center'; only consulted for non-validation splits.
        :raises NotImplementedError: if one-hot mode is requested without use_stuffthing.
        """
        self.split = self.get_split()
        self.size = size
        if crop_size is None:
            self.crop_size = size
        else:
            self.crop_size = crop_size

        assert crop_type in [None, 'random', 'center']
        self.crop_type = crop_type
        # NOTE: attribute name keeps its historical spelling for compatibility.
        self.use_segmenation = use_segmentation
        self.onehot = onehot_segmentation  # return segmentation as rgb or one hot
        self.stuffthing = use_stuffthing  # include thing in segmentation
        if self.onehot and not self.stuffthing:
            # NotImplemented is a comparison sentinel, not an exception class;
            # calling it would raise a confusing TypeError instead.
            raise NotImplementedError("One hot mode is only supported for the "
                                      "stuffthings version because labels are stored "
                                      "a bit different.")

        data_json = datajson
        with open(data_json) as json_file:
            self.json_data = json.load(json_file)
            self.img_id_to_captions = dict()
            self.img_id_to_filepath = dict()
            self.img_id_to_segmentation_filepath = dict()

        assert data_json.split("/")[-1] in [f"captions_train{self.year()}.json",
                                            f"captions_val{self.year()}.json"]
        # TODO currently hardcoded paths, would be better to follow logic in
        # cocstuff pixelmaps
        if self.use_segmenation:
            if self.stuffthing:
                self.segmentation_prefix = (
                    f"data/cocostuffthings/val{self.year()}" if
                    data_json.endswith(f"captions_val{self.year()}.json") else
                    f"data/cocostuffthings/train{self.year()}")
            else:
                self.segmentation_prefix = (
                    f"data/coco/annotations/stuff_val{self.year()}_pixelmaps" if
                    data_json.endswith(f"captions_val{self.year()}.json") else
                    f"data/coco/annotations/stuff_train{self.year()}_pixelmaps")

        imagedirs = self.json_data["images"]
        self.labels = {"image_ids": list()}
        for imgdir in tqdm(imagedirs, desc="ImgToPath"):
            self.img_id_to_filepath[imgdir["id"]] = os.path.join(dataroot, imgdir["file_name"])
            self.img_id_to_captions[imgdir["id"]] = list()
            pngfilename = imgdir["file_name"].replace("jpg", "png")
            if self.use_segmenation:
                self.img_id_to_segmentation_filepath[imgdir["id"]] = os.path.join(
                    self.segmentation_prefix, pngfilename)
            # Keep every image unless a whitelist of png names was given.
            if given_files is not None:
                if pngfilename in given_files:
                    self.labels["image_ids"].append(imgdir["id"])
            else:
                self.labels["image_ids"].append(imgdir["id"])

        capdirs = self.json_data["annotations"]
        for capdir in tqdm(capdirs, desc="ImgToCaptions"):
            # there are in average 5 captions per image
            self.img_id_to_captions[capdir["image_id"]].append(capdir["caption"])

        self.rescaler = albumentations.SmallestMaxSize(max_size=self.size)
        if self.split=="validation":
            self.cropper = albumentations.CenterCrop(height=self.crop_size, width=self.crop_size)
        else:
            # default option for train is random crop
            if self.crop_type in [None, 'random']:
                self.cropper = albumentations.RandomCrop(height=self.crop_size, width=self.crop_size)
            else:
                self.cropper = albumentations.CenterCrop(height=self.crop_size, width=self.crop_size)
        self.preprocessor = albumentations.Compose(
            [self.rescaler, self.cropper],
            additional_targets={"segmentation": "image"})
        if force_no_crop:
            self.rescaler = albumentations.Resize(height=self.size, width=self.size)
            self.preprocessor = albumentations.Compose(
                [self.rescaler],
                additional_targets={"segmentation": "image"})

    @abstractmethod
    def get_split(self):
        """Return the split name; ``"validation"`` selects center cropping."""
        raise NotImplementedError()

    @abstractmethod
    def year(self):
        """Return the dataset year as a string, e.g. ``'2017'``."""
        raise NotImplementedError()

    def __len__(self):
        return len(self.labels["image_ids"])

    def preprocess_image(self, image_path, segmentation_path=None):
        """Load, rescale/crop and normalize an image (and optionally its segmentation).

        :return: the image as float32 in [-1, 1]; when ``segmentation_path`` is given,
            a tuple ``(image, segmentation)`` where the segmentation is either an int
            one-hot array with 183 labels or float32 RGB in [-1, 1].
        """
        image = Image.open(image_path)
        if not image.mode == "RGB":
            image = image.convert("RGB")
        image = np.array(image).astype(np.uint8)
        if segmentation_path:
            segmentation = Image.open(segmentation_path)
            if not self.onehot and not segmentation.mode == "RGB":
                segmentation = segmentation.convert("RGB")
            segmentation = np.array(segmentation).astype(np.uint8)
            if self.onehot:
                assert self.stuffthing
                # stored in caffe format: unlabeled==255. stuff and thing from
                # 0-181. to be compatible with the labels in
                # https://github.com/nightrome/cocostuff/blob/master/labels.txt
                # we shift stuffthing one to the right and put unlabeled in zero
                # as long as segmentation is uint8 shifting to right handles the
                # latter too
                assert segmentation.dtype == np.uint8
                segmentation = segmentation + 1

            processed = self.preprocessor(image=image, segmentation=segmentation)

            image, segmentation = processed["image"], processed["segmentation"]
        else:
            image = self.preprocessor(image=image,)['image']

        image = (image / 127.5 - 1.0).astype(np.float32)
        if segmentation_path:
            if self.onehot:
                assert segmentation.dtype == np.uint8
                # make it one hot
                n_labels = 183
                flatseg = np.ravel(segmentation)
                # builtin bool: the np.bool alias no longer exists in current NumPy.
                onehot = np.zeros((flatseg.size, n_labels), dtype=bool)
                onehot[np.arange(flatseg.size), flatseg] = True
                onehot = onehot.reshape(segmentation.shape + (n_labels,)).astype(int)
                segmentation = onehot
            else:
                segmentation = (segmentation / 127.5 - 1.0).astype(np.float32)
            return image, segmentation
        else:
            return image

    def __getitem__(self, i):
        img_path = self.img_id_to_filepath[self.labels["image_ids"][i]]
        if self.use_segmenation:
            seg_path = self.img_id_to_segmentation_filepath[self.labels["image_ids"][i]]
            image, segmentation = self.preprocess_image(img_path, seg_path)
        else:
            image = self.preprocess_image(img_path)
        captions = self.img_id_to_captions[self.labels["image_ids"][i]]
        # randomly draw one of all available captions per image
        caption = captions[np.random.randint(0, len(captions))]
        example = {"image": image,
                   "caption": caption,
                   "img_path": img_path,
                   "filename_": img_path.split(os.sep)[-1]
                   }
        if self.use_segmenation:
            example.update({"seg_path": seg_path, 'segmentation': segmentation})
        return example
164
+
165
+
166
class CocoImagesAndCaptionsTrain2017(CocoBase):
    """returns a pair of (image, caption)

    COCO 2017 training split with hard-coded data and annotation paths.
    """
    def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False,):
        base_kwargs = dict(
            size=size,
            dataroot="data/coco/train2017",
            datajson="data/coco/annotations/captions_train2017.json",
            onehot_segmentation=onehot_segmentation,
            use_stuffthing=use_stuffthing,
            crop_size=crop_size,
            force_no_crop=force_no_crop,
        )
        super().__init__(**base_kwargs)

    def get_split(self):
        return "train"

    def year(self):
        return '2017'
180
+
181
+
182
class CocoImagesAndCaptionsValidation2017(CocoBase):
    """returns a pair of (image, caption)

    COCO 2017 validation split with hard-coded data and annotation paths;
    ``given_files`` optionally restricts the images that are used.
    """
    def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False,
                 given_files=None):
        base_kwargs = dict(
            size=size,
            dataroot="data/coco/val2017",
            datajson="data/coco/annotations/captions_val2017.json",
            onehot_segmentation=onehot_segmentation,
            use_stuffthing=use_stuffthing,
            crop_size=crop_size,
            force_no_crop=force_no_crop,
            given_files=given_files,
        )
        super().__init__(**base_kwargs)

    def get_split(self):
        return "validation"

    def year(self):
        return '2017'
198
+
199
+
200
+
201
class CocoImagesAndCaptionsTrain2014(CocoBase):
    """returns a pair of (image, caption)

    COCO 2014 training split with hard-coded paths; segmentation loading is
    always disabled and the crop type defaults to 'random'.
    """
    def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False,crop_type='random'):
        base_kwargs = dict(
            size=size,
            dataroot="data/coco/train2014",
            datajson="data/coco/annotations2014/annotations/captions_train2014.json",
            onehot_segmentation=onehot_segmentation,
            use_stuffthing=use_stuffthing,
            crop_size=crop_size,
            force_no_crop=force_no_crop,
            use_segmentation=False,
            crop_type=crop_type,
        )
        super().__init__(**base_kwargs)

    def get_split(self):
        return "train"

    def year(self):
        return '2014'
217
+
218
class CocoImagesAndCaptionsValidation2014(CocoBase):
    """returns a pair of (image, caption)

    COCO 2014 validation split with hard-coded paths; segmentation loading is
    always disabled and the crop type defaults to 'center'. Extra keyword
    arguments are accepted and ignored.
    """
    def __init__(self, size, onehot_segmentation=False, use_stuffthing=False, crop_size=None, force_no_crop=False,
                 given_files=None,crop_type='center',**kwargs):
        base_kwargs = dict(
            size=size,
            dataroot="data/coco/val2014",
            datajson="data/coco/annotations2014/annotations/captions_val2014.json",
            onehot_segmentation=onehot_segmentation,
            use_stuffthing=use_stuffthing,
            crop_size=crop_size,
            force_no_crop=force_no_crop,
            given_files=given_files,
            use_segmentation=False,
            crop_type=crop_type,
        )
        super().__init__(**base_kwargs)

    def get_split(self):
        return "validation"

    def year(self):
        return '2014'
236
+
237
if __name__ == '__main__':
    # Manual smoke test: parse the 2014 validation captions, build the dataset
    # and print basic facts about the first example.
    with open("data/coco/annotations2014/annotations/captions_val2014.json", "r") as json_file:
        json_data = json.load(json_file)
        capdirs = json_data["annotations"]
    # (A leftover `pudb.set_trace()` breakpoint was removed here: it halted every
    # run and required a debugger package that the module does not otherwise need.)
    d2 = CocoImagesAndCaptionsValidation2014(size=256)
    print("constructed dataset.")
    print(f"length of {d2.__class__.__name__}: {len(d2)}")

    ex2 = d2[0]
    print(ex2["image"].shape)
    print(ex2["caption"].__class__.__name__)
ldm/data/dummy.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import random
3
+ import string
4
+ from torch.utils.data import Dataset, Subset
5
+
6
class DummyData(Dataset):
    """Synthetic dataset returning random arrays paired with random lowercase strings.

    :param length: value reported by ``len()``.
    :param size: iterable of ints, unpacked into ``np.random.randn`` to shape
        each sample array.
    """

    def __init__(self, length, size):
        self.length = length
        self.size = size

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return ``{"jpg": ndarray, "txt": str}``; the index ``i`` is ignored."""
        x = np.random.randn(*self.size)
        letters = string.ascii_lowercase
        # `_` instead of `i` so the generator does not shadow the index parameter.
        y = ''.join(random.choice(letters) for _ in range(10))
        return {"jpg": x, "txt": y}
19
+
20
+
21
class DummyDataWithEmbeddings(Dataset):
    """Synthetic dataset pairing a random array with a random float32 embedding."""

    def __init__(self, length, size, emb_size):
        # length: reported dataset length; size / emb_size: shapes unpacked into randn.
        self.length, self.size, self.emb_size = length, size, emb_size

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return a fresh random sample; the index is not used."""
        sample = {}
        sample["jpg"] = np.random.randn(*self.size)
        embedding = np.random.randn(*self.emb_size)
        sample["txt"] = embedding.astype(np.float32)
        return sample
34
+
ldm/data/imagenet.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, yaml, pickle, shutil, tarfile, glob
2
+ import cv2
3
+ import albumentations
4
+ import PIL
5
+ import numpy as np
6
+ import torchvision.transforms.functional as TF
7
+ from omegaconf import OmegaConf
8
+ from functools import partial
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+ from torch.utils.data import Dataset, Subset
12
+
13
+ import taming.data.utils as tdu
14
+ from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
15
+ from taming.data.imagenet import ImagePaths
16
+
17
+ from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
18
+
19
+
20
def synset2idx(path_to_yaml="data/index_synset.yaml"):
    """Load the index->synset YAML mapping and return its inverse.

    :param path_to_yaml: path of a YAML file holding a mapping.
    :returns: dict mapping each value of the YAML mapping back to its key.
    """
    with open(path_to_yaml) as f:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and used an unsafe loader on old releases.
        di2s = yaml.safe_load(f)
    return {v: k for k, v in di2s.items()}
24
+
25
+
26
class ImageNetBase(Dataset):
    """Shared loading logic for the ImageNet train/validation datasets.

    Subclasses implement ``_prepare`` and are expected to set ``self.root``,
    ``self.datadir``, ``self.txt_filelist`` and ``self.random_crop`` there;
    this class then fetches the label metadata files and builds ``self.data``.

    :param config: optional dict / OmegaConf config. Keys read here:
        ``keep_orig_class_label``, ``sub_indices`` and ``size``.
    """

    def __init__(self, config=None):
        self.config = config or OmegaConf.create()
        if not isinstance(self.config, dict):
            self.config = OmegaConf.to_container(self.config)
        self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
        # NOTE(review): this overwrites whatever a subclass assigned before
        # calling super().__init__(). Left unchanged on purpose: ImageNetSR in
        # this file indexes base items with "file_path_", which presumably only
        # works when self.data is an ImagePaths object - confirm before changing.
        self.process_images = True  # if False we skip loading & processing images and self.data contains filepaths
        self._prepare()
        self._prepare_synset_to_human()
        self._prepare_idx_to_synset()
        self._prepare_human_to_integer_label()
        self._load()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def _prepare(self):
        """Set up paths and make sure the raw data exists (subclass hook)."""
        raise NotImplementedError()

    def _filter_relpaths(self, relpaths):
        """Drop ignored files and, if ``sub_indices`` is configured, other synsets."""
        ignore = {
            "n06596364_9591.JPEG",
        }
        relpaths = [rpath for rpath in relpaths if rpath.split("/")[-1] not in ignore]
        if "sub_indices" not in self.config:
            return relpaths
        indices = str_to_indices(self.config["sub_indices"])
        # A set gives O(1) membership tests across the whole file list.
        synsets = set(give_synsets_from_indices(indices, path_to_yaml=self.idx2syn))
        self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
        return [rpath for rpath in relpaths if rpath.split("/")[0] in synsets]

    def _prepare_synset_to_human(self):
        """Download the synset -> human-readable label file if missing or wrong size."""
        SIZE = 2655750
        URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
        self.human_dict = os.path.join(self.root, "synset_human.txt")
        if (not os.path.exists(self.human_dict) or
                not os.path.getsize(self.human_dict)==SIZE):
            download(URL, self.human_dict)

    def _prepare_idx_to_synset(self):
        """Download the index -> synset YAML file if it is missing."""
        URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
        self.idx2syn = os.path.join(self.root, "index_synset.yaml")
        if (not os.path.exists(self.idx2syn)):
            download(URL, self.idx2syn)

    def _prepare_human_to_integer_label(self):
        """Download (if missing) and parse the ``index:label`` file into a dict."""
        URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
        self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
        if (not os.path.exists(self.human2integer)):
            download(URL, self.human2integer)
        with open(self.human2integer, "r") as f:
            lines = f.read().splitlines()
            assert len(lines) == 1000
            self.human2integer_dict = dict()
            for line in lines:
                value, key = line.split(":")
                self.human2integer_dict[key] = int(value)

    def _load(self):
        """Read the file list, derive all labels and build ``self.data``."""
        with open(self.txt_filelist, "r") as f:
            self.relpaths = f.read().splitlines()
            l1 = len(self.relpaths)
            self.relpaths = self._filter_relpaths(self.relpaths)
            print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))

        # The first path component of every relative path is its synset id.
        self.synsets = [p.split("/")[0] for p in self.relpaths]
        self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]

        unique_synsets = np.unique(self.synsets)
        class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
        if not self.keep_orig_class_label:
            self.class_labels = [class_dict[s] for s in self.synsets]
        else:
            # _filter_relpaths only builds the mapping when "sub_indices" is
            # configured; build it here otherwise instead of failing with
            # AttributeError.
            if not hasattr(self, "synset2idx"):
                self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
            self.class_labels = [self.synset2idx[s] for s in self.synsets]

        with open(self.human_dict, "r") as f:
            human_dict = f.read().splitlines()
            human_dict = dict(line.split(maxsplit=1) for line in human_dict)

        self.human_labels = [human_dict[s] for s in self.synsets]

        labels = {
            "relpath": np.array(self.relpaths),
            "synsets": np.array(self.synsets),
            "class_label": np.array(self.class_labels),
            "human_label": np.array(self.human_labels),
        }

        if self.process_images:
            self.size = retrieve(self.config, "size", default=256)
            self.data = ImagePaths(self.abspaths,
                                   labels=labels,
                                   size=self.size,
                                   random_crop=self.random_crop,
                                   )
        else:
            self.data = self.abspaths
132
+
133
+
134
class ImageNetTrain(ImageNetBase):
    """ILSVRC2012 training split; downloads and unpacks the data on first use."""

    NAME = "ILSVRC2012_train"
    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
    AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
    FILES = [
        "ILSVRC2012_img_train.tar",
    ]
    SIZES = [
        147897477120,
    ]

    def __init__(self, process_images=True, data_root=None, **kwargs):
        self.process_images = process_images
        self.data_root = data_root
        super().__init__(**kwargs)

    def _prepare(self):
        """Resolve dataset paths and, if not yet prepared, fetch/extract the archive."""
        if self.data_root:
            base_dir = os.path.join(self.data_root, self.NAME)
        else:
            cache_home = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
            base_dir = os.path.join(cache_home, "autoencoders/data", self.NAME)
        self.root = base_dir

        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 1281167
        self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
                                    default=True)
        if tdu.is_prepared(self.root):
            return

        print("Preparing dataset {} in {}".format(self.NAME, self.root))

        datadir = self.datadir
        if not os.path.exists(datadir):
            archive = os.path.join(self.root, self.FILES[0])
            archive_ok = os.path.exists(archive) and os.path.getsize(archive)==self.SIZES[0]
            if not archive_ok:
                # Fetch the main archive via academictorrents when absent or truncated.
                import academictorrents as at
                atpath = at.get(self.AT_HASH, datastore=self.root)
                assert atpath == archive

            print("Extracting {} to {}".format(archive, datadir))
            os.makedirs(datadir, exist_ok=True)
            with tarfile.open(archive, "r:") as outer:
                outer.extractall(path=datadir)

            # The outer archive contains one tar per synset; unpack each into
            # a directory named after the tar file.
            print("Extracting sub-tars.")
            inner_tars = sorted(glob.glob(os.path.join(datadir, "*.tar")))
            for inner_path in tqdm(inner_tars):
                target_dir = inner_path[:-len(".tar")]
                os.makedirs(target_dir, exist_ok=True)
                with tarfile.open(inner_path, "r:") as inner:
                    inner.extractall(path=target_dir)

        found = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
        relative = sorted(os.path.relpath(p, start=datadir) for p in found)
        listing = "\n".join(relative)+"\n"
        with open(self.txt_filelist, "w") as f:
            f.write(listing)

        tdu.mark_prepared(self.root)
195
+
196
+
197
class ImageNetValidation(ImageNetBase):
    """ILSVRC2012 validation split; downloads, unpacks and sorts images by synset."""

    NAME = "ILSVRC2012_validation"
    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
    AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
    VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
    FILES = [
        "ILSVRC2012_img_val.tar",
        "validation_synset.txt",
    ]
    SIZES = [
        6744924160,
        1950000,
    ]

    def __init__(self, process_images=True, data_root=None, **kwargs):
        self.data_root = data_root
        self.process_images = process_images
        super().__init__(**kwargs)

    def _prepare(self):
        """Resolve dataset paths and, if not yet prepared, fetch and reorganise the data."""
        if self.data_root:
            base_dir = os.path.join(self.data_root, self.NAME)
        else:
            cache_home = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
            base_dir = os.path.join(cache_home, "autoencoders/data", self.NAME)
        self.root = base_dir
        self.datadir = os.path.join(self.root, "data")
        self.txt_filelist = os.path.join(self.root, "filelist.txt")
        self.expected_length = 50000
        self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
                                    default=False)
        if tdu.is_prepared(self.root):
            return

        print("Preparing dataset {} in {}".format(self.NAME, self.root))

        datadir = self.datadir
        if not os.path.exists(datadir):
            archive = os.path.join(self.root, self.FILES[0])
            archive_ok = os.path.exists(archive) and os.path.getsize(archive)==self.SIZES[0]
            if not archive_ok:
                import academictorrents as at
                atpath = at.get(self.AT_HASH, datastore=self.root)
                assert atpath == archive

            print("Extracting {} to {}".format(archive, datadir))
            os.makedirs(datadir, exist_ok=True)
            with tarfile.open(archive, "r:") as tar:
                tar.extractall(path=datadir)

            # File name -> synset listing, used to sort the flat image dump.
            vspath = os.path.join(self.root, self.FILES[1])
            listing_ok = os.path.exists(vspath) and os.path.getsize(vspath)==self.SIZES[1]
            if not listing_ok:
                download(self.VS_URL, vspath)

            with open(vspath, "r") as f:
                rows = f.read().splitlines()
            file_to_synset = dict(row.split() for row in rows)

            print("Reorganizing into synset folders")
            for synset in np.unique(list(file_to_synset.values())):
                os.makedirs(os.path.join(datadir, synset), exist_ok=True)
            for filename, synset in file_to_synset.items():
                shutil.move(os.path.join(datadir, filename), os.path.join(datadir, synset))

        found = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
        relative = sorted(os.path.relpath(p, start=datadir) for p in found)
        listing = "\n".join(relative)+"\n"
        with open(self.txt_filelist, "w") as f:
            f.write(listing)

        tdu.mark_prepared(self.root)
269
+
270
+
271
+
272
class ImageNetSR(Dataset):
    """ImageNet super-resolution dataset; subclasses supply ``get_base()``."""

    def __init__(self, size=None,
                 degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
                 random_crop=True):
        """
        Imagenet Superresolution Dataloader
        Each item is produced in three steps:
        1. cut a square crop of side s out of the image (random or center crop)
        2. rescale the crop to ``size`` using cv2 area interpolation
        3. build the low-resolution version with the chosen degradation

        :param size: side length the crop is resized to
        :param degradation: "bsrgan", "bsrgan_light", or one of the
            "cv_*" / "pil_*" interpolation names listed below
        :param downscale_f: low-resolution downsampling factor
        :param min_crop_f: lower bound of the factor c in
            s = c * min_img_side_len, with c drawn uniformly from (min_crop_f, max_crop_f)
        :param max_crop_f: upper bound of that factor, must be <= 1
        :param random_crop: random crop if True, center crop otherwise
        """
        self.base = self.get_base()
        assert size
        assert (size / downscale_f).is_integer()
        self.size = size
        self.LR_size = int(size / downscale_f)
        self.min_crop_f = min_crop_f
        self.max_crop_f = max_crop_f
        assert(max_crop_f <= 1.)
        self.center_crop = not random_crop

        self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)

        # Becomes True below only when a Pillow interpolation mode is selected.
        self.pil_interpolation = False

        if degradation == "bsrgan":
            self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
        elif degradation == "bsrgan_light":
            self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
        else:
            interpolation_modes = {
                "cv_nearest": cv2.INTER_NEAREST,
                "cv_bilinear": cv2.INTER_LINEAR,
                "cv_bicubic": cv2.INTER_CUBIC,
                "cv_area": cv2.INTER_AREA,
                "cv_lanczos": cv2.INTER_LANCZOS4,
                "pil_nearest": PIL.Image.NEAREST,
                "pil_bilinear": PIL.Image.BILINEAR,
                "pil_bicubic": PIL.Image.BICUBIC,
                "pil_box": PIL.Image.BOX,
                "pil_hamming": PIL.Image.HAMMING,
                "pil_lanczos": PIL.Image.LANCZOS,
            }
            interpolation_fn = interpolation_modes[degradation]

            self.pil_interpolation = degradation.startswith("pil_")

            if self.pil_interpolation:
                self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
            else:
                self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
                                                                          interpolation=interpolation_fn)

    def __len__(self):
        return len(self.base)

    def __getitem__(self, i):
        """Return the base example extended with "image", "LR_image" and "caption"."""
        example = self.base[i]
        pil_img = Image.open(example["file_path_"])
        if pil_img.mode != "RGB":
            pil_img = pil_img.convert("RGB")
        image = np.array(pil_img).astype(np.uint8)

        # Side of the square crop: a random fraction of the shorter image side.
        shorter_side = min(image.shape[:2])
        fraction = np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
        crop_side_len = int(shorter_side * fraction)

        crop_cls = albumentations.CenterCrop if self.center_crop else albumentations.RandomCrop
        self.cropper = crop_cls(height=crop_side_len, width=crop_side_len)

        image = self.cropper(image=image)["image"]
        image = self.image_rescaler(image=image)["image"]

        if self.pil_interpolation:
            # The Pillow-based resize works on PIL images, so convert back and forth.
            resized = self.degradation_process(PIL.Image.fromarray(image))
            LR_image = np.array(resized).astype(np.uint8)
        else:
            LR_image = self.degradation_process(image=image)["image"]

        # Map pixel values from [0, 255] to [-1, 1].
        example["image"] = (image/127.5 - 1.0).astype(np.float32)
        example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
        example["caption"] = example["human_label"]  # dummy caption
        return example
373
+
374
+
375
class ImageNetSRTrain(ImageNetSR):
    """Super-resolution dataset built on a pickled index subset of ImageNetTrain."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_base(self):
        """Return the Subset of ImageNetTrain selected by the pickled indices."""
        # NOTE: pickle.load can execute arbitrary code - only use a trusted index file.
        with open("data/imagenet_train_hr_indices.p", "rb") as index_file:
            hr_indices = pickle.load(index_file)
        full_dataset = ImageNetTrain(process_images=False,)
        return Subset(full_dataset, hr_indices)
384
+
385
+
386
class ImageNetSRValidation(ImageNetSR):
    """Super-resolution dataset built on a pickled index subset of ImageNetValidation."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_base(self):
        """Return the Subset of ImageNetValidation selected by the pickled indices."""
        # NOTE: pickle.load can execute arbitrary code - only use a trusted index file.
        with open("data/imagenet_val_hr_indices.p", "rb") as index_file:
            hr_indices = pickle.load(index_file)
        full_dataset = ImageNetValidation(process_images=False,)
        return Subset(full_dataset, hr_indices)
ldm/data/inpainting/__init__.py ADDED
File without changes
ldm/data/inpainting/synthetic_mask.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image, ImageDraw
2
+ import numpy as np
3
+
4
# Named parameter presets for gen_large_mask; each inner dict is unpacked
# into that function as keyword arguments.
#   p_irr                 probability of drawing a polygonal-chain mask
#   min_n_irr / max_n_irr bounds on the number of segments
#   max_l_irr / max_w_irr max segment length / max segment width
#   min_n_box / max_n_box bounds on the number of box primitives
#   min_s_box / max_s_box bounds on the length of a box side
#   marg                  margin for a box starting coordinate
settings = {
    # p_irr is 1 here and all box parameters are None.
    "256narrow": {
        "p_irr": 1,
        "min_n_irr": 4,
        "max_n_irr": 50,
        "max_l_irr": 40,
        "max_w_irr": 10,
        "min_n_box": None,
        "max_n_box": None,
        "min_s_box": None,
        "max_s_box": None,
        "marg": None,
    },
    "256train": {
        "p_irr": 0.5,
        "min_n_irr": 1,
        "max_n_irr": 5,
        "max_l_irr": 200,
        "max_w_irr": 100,
        "min_n_box": 1,
        "max_n_box": 4,
        "min_s_box": 30,
        "max_s_box": 150,
        "marg": 10,
    },
    "512train": {  # TODO: experimental
        "p_irr": 0.5,
        "min_n_irr": 1,
        "max_n_irr": 5,
        "max_l_irr": 450,
        "max_w_irr": 250,
        "min_n_box": 1,
        "max_n_box": 4,
        "min_s_box": 30,
        "max_s_box": 300,
        "marg": 10,
    },
    "512train-large": {  # TODO: experimental
        "p_irr": 0.5,
        "min_n_irr": 1,
        "max_n_irr": 5,
        "max_l_irr": 450,
        "max_w_irr": 400,
        "min_n_box": 1,
        "max_n_box": 4,
        "min_s_box": 75,
        "max_s_box": 450,
        "marg": 10,
    },
}
54
+
55
+
56
def gen_segment_mask(mask, start, end, brush_width):
    """Draw a line from `start` to `end` onto `mask` and return the result.

    Positive entries of `mask` are treated as already masked. The return value
    is a new array holding 0 for unmasked and 1 for masked pixels.
    """
    # Binarise and move to an 8-bit image so Pillow can draw on it.
    canvas = Image.fromarray((255 * (mask > 0)).astype(np.uint8))
    ImageDraw.Draw(canvas).line([start, end], fill=255, width=brush_width, joint="curve")
    return np.array(canvas) / 255
64
+
65
+
66
def gen_box_mask(mask, masked):
    """Set the rectangle `masked` = (x_0, y_0, w, h) to 1 in `mask`, in place.

    Returns the same array object that was passed in.
    """
    left, top, width, height = masked
    rows = slice(top, top + height)
    cols = slice(left, left + width)
    mask[rows, cols] = 1
    return mask
70
+
71
+
72
def gen_round_mask(mask, masked, radius):
    """Draw a filled rounded rectangle onto `mask` and return the result.

    :param mask: 2-D array; positive entries count as already masked.
    :param masked: (x_0, y_0, w, h) - upper-left corner, width and height.
    :param radius: corner radius handed to Pillow's ``rounded_rectangle``.
    :returns: new array holding 0 for unmasked and 1 for masked pixels.
    """
    x_0, y_0, w, h = masked
    # The bottom edge is y_0 + h; it was previously y_0 + w, which ignored the
    # sampled height and always produced a square.
    xy = [(x_0, y_0), (x_0 + w, y_0 + h)]

    mask = mask > 0
    mask = (255 * mask).astype(np.uint8)
    mask = Image.fromarray(mask)
    draw = ImageDraw.Draw(mask)
    draw.rounded_rectangle(xy, radius=radius, fill=255)
    mask = np.array(mask) / 255
    return mask
83
+
84
+
85
def gen_large_mask(prng, img_h, img_w,
                   marg, p_irr, min_n_irr, max_n_irr, max_l_irr, max_w_irr,
                   min_n_box, max_n_box, min_s_box, max_s_box):
    """
    Generate a random inpainting mask made of either line segments or boxes.

    prng: random generator providing ``randint`` and ``uniform``
          (presumably a ``numpy.random.RandomState``); it is now the only
          source of randomness, so a seeded generator gives a reproducible mask
    img_h: int, an image height
    img_w: int, an image width
    marg: int, a margin for a box starting coordinate
    p_irr: float, 0 <= p_irr <= 1, a probability of a polygonal chain mask

    min_n_irr: int, min number of segments
    max_n_irr: int, max number of segments
    max_l_irr: max length of a segment in polygonal chain
    max_w_irr: max width of a segment in polygonal chain

    min_n_box: int, min bound for the number of box primitives
    max_n_box: int, max bound for the number of box primitives
    min_s_box: int, min length of a box side
    max_s_box: int, max length of a box side

    returns: array of shape (img_h, img_w)
    """

    mask = np.zeros((img_h, img_w))
    uniform = prng.randint

    # Branch decisions draw from `prng` rather than the global numpy state, so
    # a seeded generator fully determines the mask.
    if prng.uniform(0, 1) < p_irr:  # generate polygonal chain
        n = uniform(min_n_irr, max_n_irr)  # sample number of segments

        for _ in range(n):
            y = uniform(0, img_h)  # sample a starting point
            x = uniform(0, img_w)

            # NOTE(review): the angle is sampled in [0, 360) but np.sin/np.cos
            # take radians; the direction is still random, left unchanged.
            a = uniform(0, 360)  # sample angle
            l = uniform(10, max_l_irr)  # sample segment length
            w = uniform(5, max_w_irr)  # sample a segment width

            # draw segment starting from (x,y) to (x_,y_) using brush of width w
            x_ = x + l * np.sin(a)
            y_ = y + l * np.cos(a)

            mask = gen_segment_mask(mask, start=(x, y), end=(x_, y_), brush_width=w)
            # Every segment starts from a freshly sampled point, so the former
            # `x, y = x_, y_` hand-over was dead code and has been dropped.
    else:  # generate Box masks
        n = uniform(min_n_box, max_n_box)  # sample number of rectangles

        for _ in range(n):
            h = uniform(min_s_box, max_s_box)  # sample box shape
            w = uniform(min_s_box, max_s_box)

            x_0 = uniform(marg, img_w - marg - w)  # sample upper-left coordinates of box
            y_0 = uniform(marg, img_h - marg - h)

            if prng.uniform(0, 1) < 0.5:
                mask = gen_box_mask(mask, masked=(x_0, y_0, w, h))
            else:
                r = uniform(0, 60)  # sample radius
                mask = gen_round_mask(mask, masked=(x_0, y_0, w, h), radius=r)
    return mask
141
+
142
+
143
def make_lama_mask(prng, h, w):
    """Generate a mask using the "256train" preset."""
    return gen_large_mask(prng, h, w, **settings["256train"])


def make_narrow_lama_mask(prng, h, w):
    """Generate a mask using the "256narrow" preset."""
    return gen_large_mask(prng, h, w, **settings["256narrow"])


def make_512_lama_mask(prng, h, w):
    """Generate a mask using the "512train" preset."""
    return gen_large_mask(prng, h, w, **settings["512train"])


def make_512_lama_mask_large(prng, h, w):
    """Generate a mask using the "512train-large" preset."""
    return gen_large_mask(prng, h, w, **settings["512train-large"])


# Lookup from preset name to its mask generator.
MASK_MODES = {
    "256train": make_lama_mask,
    "256narrow": make_narrow_lama_mask,
    "512train": make_512_lama_mask,
    "512train-large": make_512_lama_mask_large
}
155
+
156
if __name__ == "__main__":
    # Demo: write one seeded "256train" mask as an image to the path in argv[1].
    import sys

    out = sys.argv[1]

    prng = np.random.RandomState(1)
    mask = gen_large_mask(prng, 256, 256, **settings["256train"])
    Image.fromarray((255 * mask).astype(np.uint8)).save(out)
ldm/data/laion.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import webdataset as wds
2
+ import kornia
3
+ from PIL import Image
4
+ import io
5
+ import os
6
+ import torchvision
7
+ from PIL import Image
8
+ import glob
9
+ import random
10
+ import numpy as np
11
+ import pytorch_lightning as pl
12
+ from tqdm import tqdm
13
+ from omegaconf import OmegaConf
14
+ from einops import rearrange
15
+ import torch
16
+ from webdataset.handlers import warn_and_continue
17
+
18
+
19
+ from ldm.util import instantiate_from_config
20
+ from ldm.data.inpainting.synthetic_mask import gen_large_mask, MASK_MODES
21
+ from ldm.data.base import PRNGMixin
22
+
23
+
24
class DataWithWings(torch.utils.data.IterableDataset):
    """Iterable LAION dataset filtered by watermark, aesthetic and size tags.

    Samples are streamed from S3 tar shards, tagged from two on-disk key-value
    stores, filtered, transformed and yielded as ``(jpg, txt, punsafe)`` tuples.

    :param min_size: minimum original width and height a sample must have.
    :param transform: callable applied to the image; identity if None.
    :param target_transform: callable applied to the text; identity if None.

    NOTE(review): ``OnDiskKV`` and ``mmh3`` are not among the imports visible
    at the top of this file; they must be brought into scope before this class
    can be used.
    """

    def __init__(self, min_size, transform=None, target_transform=None):
        self.min_size = min_size
        # torch.nn is reached through the imported `torch` module; a bare `nn`
        # is not among the imports at the top of this file.
        self.transform = transform if transform is not None else torch.nn.Identity()
        self.target_transform = target_transform if target_transform is not None else torch.nn.Identity()
        self.kv = OnDiskKV(file='/home/ubuntu/laion5B-watermark-safety-ordered', key_format='q', value_format='ee')
        self.kv_aesthetic = OnDiskKV(file='/home/ubuntu/laion5B-aesthetic-tags-kv', key_format='q', value_format='e')
        self.pwatermark_threshold = 0.8
        self.punsafe_threshold = 0.5
        self.aesthetic_threshold = 5.
        self.total_samples = 0
        self.samples = 0
        location = 'pipe:aws s3 cp --quiet s3://s-datasets/laion5b/laion2B-data/{000000..231349}.tar -'

        self.inner_dataset = wds.DataPipeline(
            wds.ResampledShards(location),
            wds.tarfile_to_samples(handler=wds.warn_and_continue),
            wds.shuffle(1000, handler=wds.warn_and_continue),
            wds.decode('pilrgb', handler=wds.warn_and_continue),
            wds.map(self._add_tags, handler=wds.ignore_and_continue),
            wds.select(self._filter_predicate),
            wds.map_dict(jpg=self.transform, txt=self.target_transform, punsafe=self._punsafe_to_class, handler=wds.warn_and_continue),
            wds.to_tuple('jpg', 'txt', 'punsafe', handler=wds.warn_and_continue),
        )

    @staticmethod
    def _compute_hash(url, text):
        """Return the hash of url + text used as key into the tag stores (None -> '')."""
        if url is None:
            url = ''
        if text is None:
            text = ''
        total = (url + text).encode('utf-8')
        return mmh3.hash64(total)[0]

    def _add_tags(self, x):
        """Return a copy of sample `x` extended with pwatermark/punsafe/aesthetic tags."""
        hsh = self._compute_hash(x['json']['url'], x['txt'])
        pwatermark, punsafe = self.kv[hsh]
        aesthetic = self.kv_aesthetic[hsh][0]
        return {**x, 'pwatermark': pwatermark, 'punsafe': punsafe, 'aesthetic': aesthetic}

    def _punsafe_to_class(self, punsafe):
        """Map the unsafe score to a 0/1 long tensor using ``punsafe_threshold``."""
        return torch.tensor(punsafe >= self.punsafe_threshold).long()

    def _filter_predicate(self, x):
        """Keep samples that pass the watermark, aesthetic and size thresholds."""
        try:
            return x['pwatermark'] < self.pwatermark_threshold and x['aesthetic'] >= self.aesthetic_threshold and x['json']['original_width'] >= self.min_size and x['json']['original_height'] >= self.min_size
        except Exception:
            # Samples with missing or malformed metadata are dropped. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # still propagate.
            return False

    def __iter__(self):
        return iter(self.inner_dataset)
75
+
76
+
77
def dict_collation_fn(samples, combine_tensors=True, combine_scalars=True):
    """Take a list of samples (as dictionary) and create a batch, preserving the keys.

    Only keys present in every sample are kept. The handling of a key is
    decided by the type of its value in the first sample:

    * ``int`` / ``float``: stacked into an ndarray if `combine_scalars`,
      otherwise the key is dropped.
    * ``torch.Tensor``: stacked with ``torch.stack`` if `combine_tensors`,
      otherwise the key is dropped.
    * ``np.ndarray``: stacked into one ndarray if `combine_tensors`,
      otherwise the key is dropped.
    * anything else: returned as a plain list.

    :param list samples: list of sample dicts
    :param bool combine_tensors: whether to stack tensors / ndarrays
    :param bool combine_scalars: whether to stack int / float values
    :returns: single sample consisting of a batch; ``{}`` for an empty list
    :rtype: dict
    """
    if not samples:
        # set.intersection() needs at least one argument; an empty input
        # simply yields an empty batch.
        return {}

    keys = set.intersection(*(set(sample.keys()) for sample in samples))
    batched = {key: [sample[key] for sample in samples] for key in keys}

    result = {}
    for key, values in batched.items():
        first = values[0]
        if isinstance(first, (int, float)):
            if combine_scalars:
                result[key] = np.array(values)
        elif isinstance(first, torch.Tensor):
            if combine_tensors:
                result[key] = torch.stack(values)
        elif isinstance(first, np.ndarray):
            if combine_tensors:
                result[key] = np.array(values)
        else:
            result[key] = values
    return result
106
+
107
+
108
class WebDataModuleFromConfig(pl.LightningDataModule):
    """Lightning data module that builds webdataset loaders from config objects.

    :param tar_base: base path joined with each split's ``shards`` pattern, or
        the special value ``"__improvedaesthetic__"``.
    :param batch_size: number of samples per batch.
    :param train / validation / test: per-split dataset configs handed to
        ``make_loader``.
    :param num_workers: worker count for the ``WebLoader``.
    :param multinode: split shards by node if True, else single-node only.
    :param min_size: drop samples whose original width or height is smaller.
    :param max_pwatermark: drop samples whose watermark score is larger.
    """

    def __init__(self, tar_base, batch_size, train=None, validation=None,
                 test=None, num_workers=4, multinode=True, min_size=None,
                 max_pwatermark=1.0,
                 **kwargs):
        # The instance used to be passed as a positional argument here, which
        # the base constructor either misreads as its first parameter or
        # rejects with a TypeError.
        super().__init__()
        print(f'Setting tar base to {tar_base}')
        self.tar_base = tar_base
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.train = train
        self.validation = validation
        self.test = test
        self.multinode = multinode
        self.min_size = min_size  # filter out very small images
        self.max_pwatermark = max_pwatermark  # filter out watermarked images

    def make_loader(self, dataset_config, train=True):
        """Build a ``wds.WebLoader`` for one split from `dataset_config`.

        `train` is accepted for interface symmetry but not used in this method.
        """
        if 'image_transforms' in dataset_config:
            image_transforms = [instantiate_from_config(tt) for tt in dataset_config.image_transforms]
        else:
            image_transforms = []

        # Always finish with: to tensor, rescale to [-1, 1], channels last.
        image_transforms.extend([torchvision.transforms.ToTensor(),
                                 torchvision.transforms.Lambda(lambda x: rearrange(x * 2. - 1., 'c h w -> h w c'))])
        image_transforms = torchvision.transforms.Compose(image_transforms)

        if 'transforms' in dataset_config:
            transforms_config = OmegaConf.to_container(dataset_config.transforms)
        else:
            transforms_config = dict()

        # NOTE(review): `load_partial_from_config` and `identity` are not
        # defined in the visible part of this file - confirm they exist.
        transform_dict = {dkey: load_partial_from_config(transforms_config[dkey])
                if transforms_config[dkey] != 'identity' else identity
                for dkey in transforms_config}
        img_key = dataset_config.get('image_key', 'jpeg')
        transform_dict.update({img_key: image_transforms})

        if 'postprocess' in dataset_config:
            postprocess = instantiate_from_config(dataset_config['postprocess'])
        else:
            postprocess = None

        shuffle = dataset_config.get('shuffle', 0)
        shardshuffle = shuffle > 0

        nodesplitter = wds.shardlists.split_by_node if self.multinode else wds.shardlists.single_node_only

        if self.tar_base == "__improvedaesthetic__":
            print("## Warning, loading the same improved aesthetic dataset "
                    "for all splits and ignoring shards parameter.")
            tars = "pipe:aws s3 cp s3://s-laion/improved-aesthetics-laion-2B-en-subsets/aesthetics_tars/{000000..060207}.tar -"
        else:
            tars = os.path.join(self.tar_base, dataset_config.shards)

        dset = wds.WebDataset(
                tars,
                nodesplitter=nodesplitter,
                shardshuffle=shardshuffle,
                handler=wds.warn_and_continue).repeat().shuffle(shuffle)
        print(f'Loading webdataset with {len(dset.pipeline[0].urls)} shards.')

        dset = (dset
                .select(self.filter_keys)
                .decode('pil', handler=wds.warn_and_continue)
                .select(self.filter_size)
                .map_dict(**transform_dict, handler=wds.warn_and_continue)
                )
        if postprocess is not None:
            dset = dset.map(postprocess)
        dset = (dset
                .batched(self.batch_size, partial=False,
                    collation_fn=dict_collation_fn)
                )

        # Batching already happened in the dataset, hence batch_size=None here.
        loader = wds.WebLoader(dset, batch_size=None, shuffle=False,
                               num_workers=self.num_workers)

        return loader

    def filter_size(self, x):
        """Return True if sample `x` passes the size and watermark limits.

        Samples whose metadata is missing or unreadable are rejected.
        """
        try:
            valid = True
            if self.min_size is not None and self.min_size > 1:
                try:
                    valid = valid and x['json']['original_width'] >= self.min_size and x['json']['original_height'] >= self.min_size
                except Exception:
                    valid = False
            if self.max_pwatermark is not None and self.max_pwatermark < 1.0:
                try:
                    valid = valid and x['json']['pwatermark'] <= self.max_pwatermark
                except Exception:
                    valid = False
            return valid
        except Exception:
            return False

    def filter_keys(self, x):
        """Return True if sample `x` contains both a "jpg" and a "txt" entry."""
        try:
            return ("jpg" in x) and ("txt" in x)
        except Exception:
            return False

    def train_dataloader(self):
        return self.make_loader(self.train)

    def val_dataloader(self):
        return self.make_loader(self.validation, train=False)

    def test_dataloader(self):
        return self.make_loader(self.test, train=False)
219
+
220
+
221
+ from ldm.modules.image_degradation import degradation_fn_bsr_light
222
+ import cv2
223
+
224
class AddLR(object):
    """Sample post-processor that adds a degraded low-resolution image under 'lr'."""

    def __init__(self, factor, output_size, initial_size=None, image_key="jpg"):
        # factor: scale factor passed to the degradation as `sf`
        # output_size: side length of the final square 'lr' image
        # initial_size: optional side length to resize to before degrading
        # image_key: key of the source image within the sample
        self.factor = factor
        self.output_size = output_size
        self.image_key = image_key
        self.initial_size = initial_size

    def pt2np(self, x):
        """Convert a tensor in [-1, 1] to a uint8 numpy array in [0, 255]."""
        scaled = (x+1.0)*127.5
        as_bytes = scaled.clamp(0, 255).to(dtype=torch.uint8)
        return as_bytes.detach().cpu().numpy()

    def np2pt(self, x):
        """Convert a [0, 255] numpy array to a tensor in [-1, 1]."""
        return torch.from_numpy(x)/127.5-1.0

    def __call__(self, sample):
        # sample['jpg'] is tensor hwc in [-1, 1] at this point
        img = self.pt2np(sample[self.image_key])
        if self.initial_size is not None:
            start_dims = (self.initial_size, self.initial_size)
            img = cv2.resize(img, start_dims, interpolation=2)  # 2 == cv2.INTER_CUBIC
        img = degradation_fn_bsr_light(img, sf=self.factor)['image']
        final_dims = (self.output_size, self.output_size)
        img = cv2.resize(img, final_dims, interpolation=2)  # 2 == cv2.INTER_CUBIC
        sample['lr'] = self.np2pt(img)
        return sample
249
+
250
class AddBW(object):
    """Sample transform that stores a randomly weighted grayscale copy under ``'lr'``."""

    def __init__(self, image_key="jpg"):
        self.image_key = image_key

    def pt2np(self, x):
        """Map a tensor in [-1, 1] to a uint8 numpy array in [0, 255]."""
        scaled = (x + 1.0) * 127.5
        as_bytes = scaled.clamp(0, 255).to(dtype=torch.uint8)
        return as_bytes.detach().cpu().numpy()

    def np2pt(self, x):
        """Map a numpy array in [0, 255] back to a tensor in [-1, 1]."""
        return torch.from_numpy(x) / 127.5 - 1.0

    def __call__(self, sample):
        # sample[image_key] is expected to be an hwc tensor in [-1, 1] here
        # (per the original inline note).
        image = sample[self.image_key]

        # Random convex combination of the three channels.
        channel_weights = torch.rand(3, device=image.device)
        channel_weights /= channel_weights.sum()
        gray = torch.einsum('hwc,c->hw', image, channel_weights)

        # Keep as 3ch so we can pass to encoder, also we might want to add hints
        sample['lr'] = gray.unsqueeze(-1).tile(1, 1, 3)
        return sample
272
+
273
class AddMask(PRNGMixin):
    """Sample transform that adds ``'mask'`` and ``'masked_image'`` entries.

    The mask generator is looked up in ``MASK_MODES`` by ``mode``. With
    probability ``p_drop`` the generated mask is replaced by an all-ones mask.
    """

    def __init__(self, mode="512train", p_drop=0.):
        super().__init__()
        assert mode in list(MASK_MODES.keys()), f'unknown mask generation mode "{mode}"'
        self.p_drop = p_drop
        self.make_mask = MASK_MODES[mode]

    def __call__(self, sample):
        # sample['jpg'] is expected to be an hwc tensor in [-1, 1] here
        # (per the original inline note).
        image = sample['jpg']
        height, width = image.shape[0], image.shape[1]
        mask = self.make_mask(self.prng, height, width)

        drop_probs = [1 - self.p_drop, self.p_drop]
        if self.prng.choice(2, p=drop_probs):
            mask = np.ones_like(mask)

        # Binarise in place; values exactly equal to 0.5 are left untouched.
        mask[mask < 0.5] = 0
        mask[mask > 0.5] = 1

        mask = torch.from_numpy(mask[..., None])
        sample['mask'] = mask
        # Pixels where the mask is below 0.5 are kept, the rest are zeroed.
        sample['masked_image'] = image * (mask < 0.5)
        return sample
292
+
293
+
294
class AddEdge(PRNGMixin):
    """Sample transform that adds a Canny edge map plus mask entries.

    Writes ``'mask'``, ``'masked_image'`` (the edge map, optionally masked) and
    ``'smoothing_strength'`` (the normalised index of the chosen
    downsampling/sigma combination, broadcast to the mask's shape).
    """

    def __init__(self, mode="512train", mask_edges=True):
        super().__init__()
        assert mode in list(MASK_MODES.keys()), f'unknown mask generation mode "{mode}"'
        self.make_mask = MASK_MODES[mode]
        self.mask_edges = mask_edges
        self.n_down_choices = [0]
        self.sigma_choices = [1, 2]

    @torch.no_grad()
    def __call__(self, sample):
        # sample['jpg'] is expected to be an hwc tensor in [-1, 1] here
        # (per the original inline note).
        image = sample['jpg']
        height, width = image.shape[0], image.shape[1]

        mask = self.make_mask(self.prng, height, width)
        mask[mask < 0.5] = 0
        mask[mask > 0.5] = 1
        mask = torch.from_numpy(mask[..., None])
        sample['mask'] = mask

        # Draw the downsampling level first, then the blur sigma.
        n_down_idx = self.prng.choice(len(self.n_down_choices))
        sigma_idx = self.prng.choice(len(self.sigma_choices))

        grid_shape = (len(self.n_down_choices), len(self.sigma_choices))
        n_choices = grid_shape[0] * grid_shape[1]
        flat_idx = np.ravel_multi_index((n_down_idx, sigma_idx), grid_shape)
        normalized_idx = flat_idx / max(1, n_choices - 1)

        n_down = self.n_down_choices[n_down_idx]
        sigma = self.sigma_choices[sigma_idx]

        side = 4 * sigma + 1
        canny = kornia.filters.Canny(
            low_threshold=0.1,
            high_threshold=0.2,
            kernel_size=(side, side),
            sigma=(sigma, sigma),
            hysteresis=True,
        )

        # Rescale to [0, 1] and move to a 1 x c x h x w layout.
        y = (image + 1.0) / 2.0
        y = y.unsqueeze(0).permute(0, 3, 1, 2).contiguous()

        # down
        for _ in range(n_down):
            half = min(y.shape[-2], y.shape[-1]) // 2
            y = kornia.geometry.transform.resize(y, half, antialias=True)

        # edge: only the second output of the detector is used
        y = canny(y)[1]

        if n_down > 0:
            y = kornia.geometry.transform.resize(y, (height, width), interpolation="nearest")

        # Back to hwc, replicated to three channels, in [-1, 1].
        y = y.permute(0, 2, 3, 1)[0].expand(-1, -1, 3).contiguous()
        y = y * 2.0 - 1.0

        if not self.mask_edges:
            sample['masked_image'] = y
            sample['mask'] = torch.zeros_like(sample['mask'])
        else:
            sample['masked_image'] = y * (mask < 0.5)

        # concat normalized idx
        sample['smoothing_strength'] = torch.ones_like(sample['mask']) * normalized_idx

        return sample
363
+
364
+
365
def example00():
    """Demo: read one LAION shard, dump the first sample, then print ~100 more."""

    def load_example(example):
        # Decode the raw webdataset fields of a single sample.
        return {
            "key": example["__key__"],
            "image": Image.open(io.BytesIO(example["jpg"])),
            "text": example["txt"].decode(),
        }

    url = "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/000000.tar -"
    dataset = wds.WebDataset(url)

    first = next(iter(dataset))
    for field in first:
        print(field, type(first[field]))

    print(first["__key__"])
    for field in ["json", "txt"]:
        print(first[field].decode())

    # Save the first image as a PNG under ./tmp.
    image = Image.open(io.BytesIO(first["jpg"]))
    outdir = "tmp"
    os.makedirs(outdir, exist_ok=True)
    out_path = os.path.join(outdir, first["__key__"] + ".png")
    image.save(out_path)

    for i, raw in tqdm(enumerate(dataset)):
        decoded = load_example(raw)
        print(decoded["image"].size, decoded["text"])
        if i >= 100:
            break
395
+
396
+
397
def example01():
    """Demo: iterate three shards for five epochs and print batch keys."""
    # the first laion shards contain ~10k examples each
    url = "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/{000000..000002}.tar -"

    batch_size = 3
    shuffle_buffer = 10000
    num_workers = 2

    dset = wds.WebDataset(
        url,
        nodesplitter=wds.shardlists.split_by_node,
        shardshuffle=True,
    )
    dset = dset.shuffle(shuffle_buffer, initial=shuffle_buffer)
    dset = dset.decode('pil', handler=warn_and_continue)
    dset = dset.batched(batch_size, partial=False, collation_fn=dict_collation_fn)

    loader = wds.WebLoader(dset, batch_size=None, shuffle=False, num_workers=num_workers)

    keys_per_epoch = []
    for epoch in range(5):
        batch_sizes = []
        keys = []
        for batch in tqdm(loader):
            batch_keys = batch["__key__"]
            batch_sizes.append(len(batch_keys))
            keys.append(batch_keys)

        # partial=False above means every batch should be full.
        for seen in batch_sizes:
            assert seen==batch_size
        print(f"{len(batch_sizes)} batches of size {batch_size}.")

        keys_per_epoch.append(keys)
        for i_batch in (0, 1, -1):
            print(f"Batch {i_batch} of epoch {epoch}:")
            print(keys[i_batch])
        print("next epoch.")
436
+
437
+
438
def example02():
    """Demo: build the train dataloader from a YAML config and print one batch.

    Loads the config file named below, instantiates ``WebDataModuleFromConfig``
    from its ``data.params`` section, and prints the keys and the ``"jpg"``
    shape of the first batch.
    """
    # Only OmegaConf is needed here. The other function-local imports the
    # original carried were unused, and one of them
    # (pytorch_lightning.trainer.supporters) appears to be missing from newer
    # PyTorch Lightning releases, so they could only make this demo fail.
    from omegaconf import OmegaConf

    # Alternative configs that were tried previously:
    #config = OmegaConf.load("configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml")
    #config = OmegaConf.load("configs/stable-diffusion/txt2img-upscale-clip-encoder-f16-1024.yaml")
    config = OmegaConf.load("configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256.yaml")
    datamod = WebDataModuleFromConfig(**config["data"]["params"])
    dataloader = datamod.train_dataloader()

    for batch in dataloader:
        print(batch.keys())
        print(batch["jpg"].shape)
        break
455
+
456
+
457
def example03():
    """Demo: count large and un-watermarked samples in the aesthetics subset.

    Streams the shards, counts samples whose metadata reports both sides
    >= 512 and, among those, samples with ``pwatermark`` < 0.5. The first
    ``n_save`` samples passing both filters are written to ``tmp/`` as PNGs,
    and running statistics are printed every 500 samples.
    """
    # improved aesthetics
    tars = "pipe:aws s3 cp s3://s-laion/improved-aesthetics-laion-2B-en-subsets/aesthetics_tars/{000000..060207}.tar -"
    dataset = wds.WebDataset(tars)

    def filter_keys(x):
        try:
            return ("jpg" in x) and ("txt" in x)
        except Exception:
            return False

    def filter_size(x):
        try:
            return x['json']['original_width'] >= 512 and x['json']['original_height'] >= 512
        except Exception:
            return False

    def filter_watermark(x):
        try:
            return x['json']['pwatermark'] < 0.5
        except Exception:
            return False

    dataset = (dataset
               .select(filter_keys)
               .decode('pil', handler=wds.warn_and_continue))

    # The original saved into "tmp" without creating it, which fails on a
    # fresh checkout; create the output directory up front.
    outdir = "tmp"
    os.makedirs(outdir, exist_ok=True)

    n_save = 20
    n_total = 0
    n_large = 0
    n_large_nowm = 0
    for i, example in enumerate(dataset):
        n_total += 1
        if filter_size(example):
            n_large += 1
            if filter_watermark(example):
                n_large_nowm += 1
                # Only the first n_save passing samples are written to disk.
                if n_large_nowm <= n_save:
                    image = example["jpg"]
                    image.save(os.path.join(outdir, f"{n_large_nowm-1:06}.png"))

        if i%500 == 0:
            print(i)
            print(f"Large: {n_large}/{n_total} | {n_large/n_total*100:.2f}%")
            if n_large > 0:
                print(f"No Watermark: {n_large_nowm}/{n_large} | {n_large_nowm/n_large*100:.2f}%")
502
+
503
+
504
+
505
def example04():
    """Demo: probe every aesthetics shard, newest first, and report unreadable ones.

    For each shard index from 60207 down to 0, the first sample that has both
    a "jpg" and a "txt" entry is fetched and decoded; any failure is reported
    as ``Error @ <shard>`` and the scan continues.
    """
    # Defined once; the original redefined it (and an unused size filter)
    # on every one of the 60208 iterations.
    def filter_keys(x):
        try:
            return ("jpg" in x) and ("txt" in x)
        except Exception:
            return False

    # improved aesthetics
    for i_shard in range(60208)[::-1]:
        print(i_shard)
        tars = "pipe:aws s3 cp s3://s-laion/improved-aesthetics-laion-2B-en-subsets/aesthetics_tars/{:06}.tar -".format(i_shard)
        dataset = wds.WebDataset(tars)

        dataset = (dataset
                   .select(filter_keys)
                   .decode('pil', handler=wds.warn_and_continue))
        try:
            # Only readability matters; the sample itself is discarded.
            next(iter(dataset))
        except Exception:
            print(f"Error @ {i_shard}")
531
+
532
+
533
if __name__ == "__main__":
    # Script entry point: exactly one demo is enabled at a time.
    # Disabled alternatives:
    #   example01()
    #   example02()
    #   example04()
    example03()
ldm/data/lsun.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import PIL
4
+ from PIL import Image
5
+ from torch.utils.data import Dataset
6
+ from torchvision import transforms
7
+
8
+
9
class LSUNBase(Dataset):
    """Dataset over LSUN images listed, one relative path per line, in a text file.

    Each item is a dict with ``relative_file_path_``, ``file_path_`` and
    ``image``: a center-cropped, optionally resized, randomly flipped RGB
    image as a float32 array scaled to [-1, 1].

    Args:
        txt_file: Path of the text file listing relative image paths.
        data_root: Directory the listed paths are relative to.
        size: If not None, images are resized to ``size`` x ``size``.
        interpolation: One of "linear", "bilinear", "bicubic", "lanczos".
        flip_p: Probability of a horizontal flip.

    Raises:
        KeyError: If ``interpolation`` is not one of the supported names.
    """

    def __init__(self,
                 txt_file,
                 data_root,
                 size=None,
                 interpolation="bicubic",
                 flip_p=0.5
                 ):
        self.data_paths = txt_file
        self.data_root = data_root
        with open(self.data_paths, "r") as f:
            self.image_paths = f.read().splitlines()
        self._length = len(self.image_paths)
        self.labels = {
            "relative_file_path_": list(self.image_paths),
            "file_path_": [os.path.join(self.data_root, rel)
                           for rel in self.image_paths],
        }

        self.size = size
        # PIL.Image.LINEAR was only an alias of BILINEAR and was removed in
        # Pillow 10. Because this table is built eagerly, referencing it made
        # construction fail for every interpolation choice, so "linear" now
        # maps to BILINEAR directly.
        self.interpolation = {"linear": PIL.Image.BILINEAR,
                              "bilinear": PIL.Image.BILINEAR,
                              "bicubic": PIL.Image.BICUBIC,
                              "lanczos": PIL.Image.LANCZOS,
                              }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        example = {k: self.labels[k][i] for k in self.labels}
        # convert() loads the pixel data and returns a new image, so the file
        # handle can be closed right away instead of lingering until GC.
        with Image.open(example["file_path_"]) as opened:
            image = opened.convert("RGB")

        # default to score-sde preprocessing: center crop to a square
        img = np.array(image).astype(np.uint8)
        h, w = img.shape[0], img.shape[1]
        crop = min(h, w)
        img = img[(h - crop) // 2:(h + crop) // 2,
                  (w - crop) // 2:(w + crop) // 2]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize((self.size, self.size), resample=self.interpolation)

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        # uint8 [0, 255] -> float32 [-1, 1]
        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
        return example
60
+
61
+
62
class LSUNChurchesTrain(LSUNBase):
    """LSUN outdoor-church training split at the default ``data/lsun`` paths."""

    def __init__(self, **kwargs):
        list_file = "data/lsun/church_outdoor_train.txt"
        image_dir = "data/lsun/churches"
        super().__init__(txt_file=list_file, data_root=image_dir, **kwargs)
65
+
66
+
67
class LSUNChurchesValidation(LSUNBase):
    """LSUN outdoor-church validation split; flipping is off by default."""

    def __init__(self, flip_p=0., **kwargs):
        list_file = "data/lsun/church_outdoor_val.txt"
        image_dir = "data/lsun/churches"
        super().__init__(txt_file=list_file, data_root=image_dir,
                         flip_p=flip_p, **kwargs)
71
+
72
+
73
class LSUNBedroomsTrain(LSUNBase):
    """LSUN bedroom training split at the default ``data/lsun`` paths."""

    def __init__(self, **kwargs):
        list_file = "data/lsun/bedrooms_train.txt"
        image_dir = "data/lsun/bedrooms"
        super().__init__(txt_file=list_file, data_root=image_dir, **kwargs)
76
+
77
+
78
class LSUNBedroomsValidation(LSUNBase):
    """LSUN bedroom validation split; flipping is off by default."""

    def __init__(self, flip_p=0.0, **kwargs):
        list_file = "data/lsun/bedrooms_val.txt"
        image_dir = "data/lsun/bedrooms"
        super().__init__(txt_file=list_file, data_root=image_dir,
                         flip_p=flip_p, **kwargs)
82
+
83
+
84
class LSUNCatsTrain(LSUNBase):
    """LSUN cat training split at the default ``data/lsun`` paths."""

    def __init__(self, **kwargs):
        list_file = "data/lsun/cat_train.txt"
        image_dir = "data/lsun/cats"
        super().__init__(txt_file=list_file, data_root=image_dir, **kwargs)
87
+
88
+
89
class LSUNCatsValidation(LSUNBase):
    """LSUN cat validation split; flipping is off by default."""

    def __init__(self, flip_p=0., **kwargs):
        list_file = "data/lsun/cat_val.txt"
        image_dir = "data/lsun/cats"
        super().__init__(txt_file=list_file, data_root=image_dir,
                         flip_p=flip_p, **kwargs)
ldm/data/nerf_like.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ import os
3
+ import json
4
+ import numpy as np
5
+ import torch
6
+ import imageio
7
+ import math
8
+ import cv2
9
+ from torchvision import transforms
10
+
11
def cartesian_to_spherical(xyz):
    """Convert Cartesian points to (polar angle, azimuth, radius).

    Args:
        xyz: Array of shape (N, 3) holding x, y, z in its columns.

    Returns:
        Array of shape (3, N): row 0 is the polar angle measured from the +Z
        axis down, row 1 the azimuth ``arctan2(y, x)``, row 2 the Euclidean
        norm. Angles are in radians.
    """
    # The original also built an unused (N, 6) scratch array on every call;
    # that dead allocation is dropped here.
    xy = xyz[:, 0]**2 + xyz[:, 1]**2
    z = np.sqrt(xy + xyz[:, 2]**2)
    theta = np.arctan2(np.sqrt(xy), xyz[:, 2])  # for elevation angle defined from Z-axis down
    # Alternative convention (elevation from the XY-plane up):
    #   np.arctan2(xyz[:, 2], np.sqrt(xy))
    azimuth = np.arctan2(xyz[:, 1], xyz[:, 0])
    return np.array([theta, azimuth, z])
19
+
20
+
21
def get_T(T_target, T_cond):
    """Encode the relative pose between two positions as a 4-vector.

    Both inputs are single 3-vectors. Returns a tensor
    ``[d_theta, sin(d_azimuth), cos(d_azimuth), d_radius]`` where each delta
    is target minus condition and the azimuth delta is wrapped to [0, 2*pi).
    """
    cond_theta, cond_azimuth, cond_radius = cartesian_to_spherical(T_cond[None, :])
    tgt_theta, tgt_azimuth, tgt_radius = cartesian_to_spherical(T_target[None, :])

    delta_theta = (tgt_theta - cond_theta).item()
    delta_azimuth = ((tgt_azimuth - cond_azimuth) % (2 * math.pi)).item()
    delta_radius = (tgt_radius - cond_radius).item()

    components = [
        delta_theta,
        math.sin(delta_azimuth),
        math.cos(delta_azimuth),
        delta_radius,
    ]
    return torch.tensor(components)
31
+
32
def get_spherical(T_target, T_cond):
    """Return the relative pose between two positions in degrees.

    Both inputs are single 3-vectors. Returns a tensor
    ``[d_theta_deg, d_azimuth_deg, d_radius]`` where each delta is target
    minus condition and the azimuth delta is wrapped to [0, 360).
    """
    cond_theta, cond_azimuth, cond_radius = cartesian_to_spherical(T_cond[None, :])
    tgt_theta, tgt_azimuth, tgt_radius = cartesian_to_spherical(T_target[None, :])

    delta_theta = (tgt_theta - cond_theta).item()
    delta_azimuth = ((tgt_azimuth - cond_azimuth) % (2 * math.pi)).item()
    delta_radius = (tgt_radius - cond_radius).item()

    components = [
        math.degrees(delta_theta),
        math.degrees(delta_azimuth),
        delta_radius,
    ]
    return torch.tensor(components)
42
+
43
class RTMV(Dataset):
    """One item per scene folder: image tensor(s) plus the first ``first_K`` poses.

    Every sub-directory of ``root_dir`` is a scene described by a
    ``transforms.json`` file. Only frame 0 is loaded as an image unless
    ``load_target`` is set, in which case all ``first_K`` frames are loaded.
    """

    def __init__(self, root_dir='datasets/RTMV/google_scanned',\
                 first_K=64, resolution=256, load_target=False):
        self.root_dir = root_dir
        # Immediate sub-directories only, in sorted order.
        _, scene_dirs, _ = next(os.walk(root_dir))
        self.scene_list = sorted(scene_dirs)
        self.first_K = first_K
        self.resolution = resolution
        self.load_target = load_target

    def __len__(self):
        return len(self.scene_list)

    def __getitem__(self, idx):
        scene_dir = os.path.join(self.root_dir, self.scene_list[idx])
        meta_path = os.path.join(scene_dir, 'transforms.json')
        with open(meta_path, "r") as f:
            meta = json.load(f)

        frames = meta['frames']
        target_hw = (self.resolution, self.resolution)
        imgs, poses = [], []
        for view_idx in range(self.first_K):
            frame = frames[view_idx]

            if view_idx == 0 or self.load_target:
                raw = imageio.imread(os.path.join(scene_dir, frame['file_path']))
                imgs.append(cv2.resize(raw, target_hw, interpolation = cv2.INTER_LINEAR))

            poses.append(frame['transform_matrix'])

        rgba = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs
        rgb = torch.tensor(self.blend_rgba(rgba)).permute(0, 3, 1, 2)
        rgb = rgb * 2 - 1. # convert to stable diffusion range
        pose_tensor = torch.tensor(np.array(poses).astype(np.float32))
        return rgb, pose_tensor

    def blend_rgba(self, img):
        """Composite RGBA onto a white background, returning RGB."""
        alpha = img[..., -1:]
        return img[..., :3] * alpha + (1. - alpha) # blend A to RGB
82
+
83
+
84
class GSO(Dataset):
    """One item per scene listed in ``<root_dir>/<split>.json``.

    Each scene directory holds a ``transforms_<name>.json`` file. Only
    frame 0 is loaded as an image unless ``load_target`` is set, in which case
    all ``first_K`` frames are loaded.

    Args:
        root_dir: Dataset root containing the split file and scene folders.
        split: Name of the JSON file (without extension) listing the scenes.
        first_K: Number of leading frames whose poses are returned.
        resolution: Side length images are resized to.
        load_target: Whether to load images for every frame, not just frame 0.
        name: Suffix selecting the per-scene transforms file.
    """

    def __init__(self, root_dir='datasets/GoogleScannedObjects',\
                 split='val', first_K=5, resolution=256, load_target=False, name='render_mvs'):
        self.root_dir = root_dir
        with open(os.path.join(root_dir, '%s.json' % split), "r") as f:
            self.scene_list = json.load(f)
        self.resolution = resolution
        self.first_K = first_K
        self.load_target = load_target
        self.name = name

    def __len__(self):
        return len(self.scene_list)

    def __getitem__(self, idx):
        """Return ``(imgs, poses)`` for scene ``idx``.

        ``imgs`` is channel-first and scaled to [-1, 1]; ``poses`` is a
        float32 tensor stacking the ``transform_matrix`` of each frame.

        Raises:
            IndexError: If the scene has fewer than ``first_K`` frames.
        """
        scene_dir = os.path.join(self.root_dir, self.scene_list[idx])
        with open(os.path.join(scene_dir, 'transforms_%s.json' % self.name), "r") as f:
            meta = json.load(f)
        imgs = []
        poses = []
        for i_img in range(self.first_K):
            meta_img = meta['frames'][i_img]

            if i_img == 0 or self.load_target:
                img_path = os.path.join(scene_dir, meta_img['file_path'])
                img = imageio.imread(img_path)
                img = cv2.resize(img, (self.resolution, self.resolution), interpolation = cv2.INTER_LINEAR)
                imgs.append(img)

            c2w = meta_img['transform_matrix']
            poses.append(c2w)

        imgs = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs
        # The original also sliced out an alpha `mask` here that was never
        # used; it is dropped.
        imgs = torch.tensor(self.blend_rgba(imgs)).permute(0, 3, 1, 2)
        imgs = imgs * 2 - 1. # convert to stable diffusion range
        poses = torch.tensor(np.array(poses).astype(np.float32))
        return imgs, poses

    def blend_rgba(self, img):
        """Composite RGBA onto a white background, returning RGB."""
        img = img[..., :3] * img[..., -1:] + (1. - img[..., -1:]) # blend A to RGB
        return img
126
+
127
class WILD(Dataset):
    """One item per scene folder, described by ``transforms_train.json``.

    Image paths in the metadata carry no extension; ``.png`` is appended when
    loading. Only frame 0 is loaded as an image unless ``load_target`` is set.
    """

    def __init__(self, root_dir='data/nerf_wild',\
                 first_K=33, resolution=256, load_target=False):
        self.root_dir = root_dir
        # Immediate sub-directories only, in sorted order.
        _, scene_dirs, _ = next(os.walk(root_dir))
        self.scene_list = sorted(scene_dirs)
        self.first_K = first_K
        self.resolution = resolution
        self.load_target = load_target

    def __len__(self):
        return len(self.scene_list)

    def __getitem__(self, idx):
        scene_dir = os.path.join(self.root_dir, self.scene_list[idx])
        meta_path = os.path.join(scene_dir, 'transforms_train.json')
        with open(meta_path, "r") as f:
            meta = json.load(f)

        frames = meta['frames']
        target_hw = (self.resolution, self.resolution)
        imgs, poses = [], []
        for view_idx in range(self.first_K):
            frame = frames[view_idx]

            if view_idx == 0 or self.load_target:
                stem = os.path.join(scene_dir, frame['file_path'])
                raw = imageio.imread(stem + '.png')
                imgs.append(cv2.resize(raw, target_hw, interpolation = cv2.INTER_LINEAR))

            poses.append(frame['transform_matrix'])

        rgba = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs
        rgb = torch.tensor(self.blend_rgba(rgba)).permute(0, 3, 1, 2)
        rgb = rgb * 2 - 1. # convert to stable diffusion range
        pose_tensor = torch.tensor(np.array(poses).astype(np.float32))
        return rgb, pose_tensor

    def blend_rgba(self, img):
        """Composite RGBA onto a white background, returning RGB."""
        alpha = img[..., -1:]
        return img[..., :3] * alpha + (1. - alpha) # blend A to RGB
ldm/data/simple.py ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+ import webdataset as wds
3
+ import numpy as np
4
+ from omegaconf import DictConfig, ListConfig
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+ from pathlib import Path
8
+ import json
9
+ from PIL import Image
10
+ from torchvision import transforms
11
+ import torchvision
12
+ from einops import rearrange
13
+ from ldm.util import instantiate_from_config
14
+ from datasets import load_dataset
15
+ import pytorch_lightning as pl
16
+ import copy
17
+ import csv
18
+ import cv2
19
+ import random
20
+ import matplotlib.pyplot as plt
21
+ from torch.utils.data import DataLoader
22
+ import json
23
+ import os
24
+ import webdataset as wds
25
+ import math
26
+ from torch.utils.data.distributed import DistributedSampler
27
+
28
+ # Some hacky things to make experimentation easier
29
def make_transform_multi_folder_data(paths, caption_files=None, **kwargs):
    """Build the multi-folder dataset and wrap it in a ``TransformDataset``."""
    combined = make_multi_folder_data(paths, caption_files, **kwargs)
    wrapped = TransformDataset(combined)
    return wrapped
32
+
33
def make_nfp_data(base_path):
    """Concatenate one ``NfpDataset`` per entry matched by ``base_path/*/``.

    Every dataset gets its own copy of a Resize(512) + CenterCrop(512)
    transform list and a fixed default caption.
    """
    folders = list(Path(base_path).glob("*/"))
    print(f"Found {len(folders)} folders")
    print(folders)

    base_tforms = [transforms.Resize(512), transforms.CenterCrop(512)]
    per_folder = []
    for folder in folders:
        per_folder.append(
            NfpDataset(
                folder,
                image_transforms=copy.copy(base_tforms),
                default_caption="A view from a train window",
            )
        )
    return torch.utils.data.ConcatDataset(per_folder)
40
+
41
+
42
class VideoDataset(Dataset):
    """Dataset yielding a random frame, ``n`` earlier frames and a caption per video.

    Args:
        root_dir: Directory searched recursively for ``*.mp4`` files.
        image_transforms: Transforms applied before the built-in
            ``ToTensor`` + rescale-to-[-1, 1] + ``c h w -> h w c`` steps.
            Either a ``ListConfig`` of instantiable configs or a sequence of
            ready-made transforms. The caller's sequence is not modified.
        caption_file: CSV file whose rows are ``(file name, caption)`` pairs.
        offset: Frame distance between consecutive history frames.
        n: Number of history frames returned under ``"prev"``.
    """

    def __init__(self, root_dir, image_transforms, caption_file, offset=8, n=2):
        self.root_dir = Path(root_dir)
        self.caption_file = caption_file
        self.n = n
        ext = "mp4"
        self.paths = sorted(self.root_dir.rglob(f"*.{ext}"))
        self.offset = offset

        if isinstance(image_transforms, ListConfig):
            image_transforms = [instantiate_from_config(tt) for tt in image_transforms]
        else:
            # Copy so the caller's list is not extended in place (the original
            # mutated its argument).
            image_transforms = list(image_transforms)
        image_transforms.extend([transforms.ToTensor(),
                                 transforms.Lambda(lambda x: rearrange(x * 2. - 1., 'c h w -> h w c'))])
        self.tform = transforms.Compose(image_transforms)

        # newline="" is what the csv module documents for files it reads.
        with open(self.caption_file, newline="") as f:
            self.captions = dict(csv.reader(f))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        """Return the sample dict, or None if ten consecutive attempts fail.

        Each attempt draws a new random frame, so a transient decode failure
        can succeed on retry.
        """
        for _ in range(10):
            try:
                return self._load_sample(index)
            except Exception:
                # Not really good enough but...
                print("uh oh")
        # Best-effort contract kept from the original: give up silently.
        return None

    def _load_sample(self, index):
        """Decode one random frame plus its ``n`` history frames from video ``index``."""
        n = self.n
        filename = self.paths[index]
        # Leave room for the whole history; for n <= 2 this equals the
        # original 2*offset + 2 margin.
        min_frame = max(2, n)*self.offset + 2

        vid = cv2.VideoCapture(str(filename))
        try:
            max_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
            # randint is inclusive on both ends and frame indices are
            # zero-based, so the last valid index is max_frames - 1. The
            # original could ask for index max_frames, which fails to decode.
            curr_frame_n = random.randint(min_frame, max_frames - 1)
            vid.set(cv2.CAP_PROP_POS_FRAMES, curr_frame_n)
            ok, curr_frame = vid.read()
            if not ok:
                raise IOError(f"could not read frame {curr_frame_n} of {filename}")

            prev_frames = []
            for i in range(n):
                prev_frame_n = curr_frame_n - (i+1)*self.offset
                vid.set(cv2.CAP_PROP_POS_FRAMES, prev_frame_n)
                ok, prev_frame = vid.read()
                if not ok:
                    raise IOError(f"could not read frame {prev_frame_n} of {filename}")
                # OpenCV decodes as BGR; reverse the channel axis for PIL.
                prev_frame = self.tform(Image.fromarray(prev_frame[..., ::-1]))
                prev_frames.append(prev_frame)
        finally:
            # Release the capture handle even when decoding fails.
            vid.release()

        caption = self.captions[filename.name]
        data = {
            "image": self.tform(Image.fromarray(curr_frame[..., ::-1])),
            # History frames are concatenated along the channel (last) axis.
            "prev": torch.cat(prev_frames, dim=-1),
            "txt": caption
        }
        return data
99
+
100
+ # end hacky things
101
+
102
+
103
def make_tranforms(image_transforms):
    """Return the fixed tensor-conversion pipeline.

    NOTE: ``image_transforms`` is currently ignored; instantiating transforms
    from a ``ListConfig`` is disabled in this version. The result is always
    ``ToTensor`` followed by a rescale to [-1, 1] and a ``c h w -> h w c``
    rearrange.
    """
    to_signed_hwc = transforms.Lambda(lambda x: rearrange(x * 2. - 1., 'c h w -> h w c'))
    pipeline = [transforms.ToTensor(), to_signed_hwc]
    return transforms.Compose(pipeline)
111
+
112
+
113
def make_multi_folder_data(paths, caption_files=None, **kwargs):
    """Make a concat dataset from multiple folders.

    ``paths`` is either a list of folders or a mapping of
    ``folder -> number of times to repeat it``. Caption files are not
    supported together with the mapping form. When ``caption_files`` is given
    it is paired with ``paths`` positionally.
    """
    if isinstance(paths, (Dict, DictConfig)):
        assert caption_files is None, \
            "Caption files not yet supported for repeats"
        paths = [
            folder_path
            for folder_path, repeats in paths.items()
            for _ in range(repeats)
        ]

    if caption_files is None:
        datasets = [FolderData(folder, **kwargs) for folder in paths]
    else:
        datasets = [
            FolderData(folder, caption_file=captions, **kwargs)
            for folder, captions in zip(paths, caption_files)
        ]
    return torch.utils.data.ConcatDataset(datasets)
133
+
134
+
135
+
136
class NfpDataset(Dataset):
    """Dataset of consecutive image pairs from one folder.

    Item ``i`` pairs the ``i``-th image (``"prev"``) with the ``i+1``-th
    (``"image"``) in sorted path order, so a folder with ``k`` images yields
    ``k - 1`` items.
    """

    def __init__(self,
                 root_dir,
                 image_transforms=None,
                 ext="jpg",
                 default_caption="",
                 ) -> None:
        """assume sequential frames and a deterministic transform

        Args:
            root_dir: Directory searched recursively for ``*.<ext>`` files.
            image_transforms: Forwarded to ``make_tranforms``; None is treated
                as an empty list (replaces the former mutable ``[]`` default).
            ext: File extension to search for.
            default_caption: Caption stored under ``"txt"`` for every item.
        """

        self.root_dir = Path(root_dir)
        self.default_caption = default_caption

        self.paths = sorted(self.root_dir.rglob(f"*.{ext}"))
        if image_transforms is None:
            image_transforms = []
        self.tform = make_tranforms(image_transforms)

    def __len__(self):
        # An empty folder used to give -1 here, which makes len() raise
        # ValueError; clamp so an empty dataset is simply empty.
        return max(0, len(self.paths) - 1)

    def __getitem__(self, index):
        prev = self.paths[index]
        curr = self.paths[index+1]
        data = {}
        data["image"] = self._load_im(curr)
        data["prev"] = self._load_im(prev)
        data["txt"] = self.default_caption
        return data

    def _load_im(self, filename):
        """Open ``filename`` as RGB and apply the dataset transform."""
        im = Image.open(filename).convert("RGB")
        return self.tform(im)
167
+
168
class ObjaverseDataModuleFromConfig(pl.LightningDataModule):
    """Lightning data module serving ``ObjaverseData`` splits.

    Args:
        root_dir: Root directory handed to ``ObjaverseData``.
        batch_size: Batch size for every loader.
        total_view: Forwarded to ``ObjaverseData``.
        train: Optional dataset config; consulted for ``image_transforms.size``.
        validation: Optional dataset config; takes precedence over ``train``.
        test: Accepted for config compatibility; not used.
        num_workers: Worker count for every loader.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, root_dir, batch_size, total_view, train=None, validation=None,
                 test=None, num_workers=4, **kwargs):
        # The original called super().__init__(self), passing the instance as
        # a stray positional argument to LightningDataModule.__init__.
        super().__init__()
        self.root_dir = root_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.total_view = total_view

        # Validation config wins over the train config, as before. When
        # neither is given, fall back to "no resize" instead of failing on an
        # unbound variable.
        dataset_config = validation if validation is not None else train

        if dataset_config is not None and 'image_transforms' in dataset_config:
            image_transforms = [torchvision.transforms.Resize(dataset_config.image_transforms.size)]
        else:
            image_transforms = []
        image_transforms.extend([transforms.ToTensor(),
                                 transforms.Lambda(lambda x: rearrange(x * 2. - 1., 'c h w -> h w c'))])
        self.image_transforms = torchvision.transforms.Compose(image_transforms)

    def train_dataloader(self):
        """Loader over the training split, sharded with a ``DistributedSampler``."""
        dataset = ObjaverseData(root_dir=self.root_dir, total_view=self.total_view, validation=False, \
                                image_transforms=self.image_transforms)
        sampler = DistributedSampler(dataset)
        return wds.WebLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False, sampler=sampler)

    def val_dataloader(self):
        """Loader over the validation split, unshuffled."""
        dataset = ObjaverseData(root_dir=self.root_dir, total_view=self.total_view, validation=True, \
                                image_transforms=self.image_transforms)
        # The original also constructed a DistributedSampler here but never
        # passed it to the loader. Constructing one needs an initialized
        # process group, so the unused sampler is dropped.
        return wds.WebLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

    def test_dataloader(self):
        """Loader over the held-out split, unshuffled.

        NOTE(review): the original read ``self.validation``, which is never
        assigned, and passed no ``image_transforms``, so this method could not
        work as written. ``ObjaverseData`` only offers a train/validation
        split, so the validation split is used here - confirm this is the
        intended test set.
        """
        dataset = ObjaverseData(root_dir=self.root_dir, total_view=self.total_view, validation=True, \
                                image_transforms=self.image_transforms)
        return wds.WebLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
206
+
207
+
208
+ class ObjaverseData(Dataset):
209
+ def __init__(self,
210
+ root_dir='.objaverse/hf-objaverse-v1/views',
211
+ image_transforms=[],
212
+ ext="png",
213
+ default_trans=torch.zeros(3),
214
+ postprocess=None,
215
+ return_paths=False,
216
+ total_view=4,
217
+ validation=False
218
+ ) -> None:
219
+ """Create a dataset from a folder of images.
220
+ If you pass in a root directory it will be searched for images
221
+ ending in ext (ext can be a list)
222
+ """
223
+ self.root_dir = Path(root_dir)
224
+ self.default_trans = default_trans
225
+ self.return_paths = return_paths
226
+ if isinstance(postprocess, DictConfig):
227
+ postprocess = instantiate_from_config(postprocess)
228
+ self.postprocess = postprocess
229
+ self.total_view = total_view
230
+
231
+ if not isinstance(ext, (tuple, list, ListConfig)):
232
+ ext = [ext]
233
+
234
+ with open(os.path.join(root_dir, 'valid_paths.json')) as f:
235
+ self.paths = json.load(f)
236
+
237
+ total_objects = len(self.paths)
238
+ if validation:
239
+ self.paths = self.paths[math.floor(total_objects / 100. * 99.):] # used last 1% as validation
240
+ else:
241
+ self.paths = self.paths[:math.floor(total_objects / 100. * 99.)] # used first 99% as training
242
+ print('============= length of dataset %d =============' % len(self.paths))
243
+ self.tform = image_transforms
244
+
245
+ def __len__(self):
246
+ return len(self.paths)
247
+
248
+ def cartesian_to_spherical(self, xyz):
249
+ ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
250
+ xy = xyz[:,0]**2 + xyz[:,1]**2
251
+ z = np.sqrt(xy + xyz[:,2]**2)
252
+ theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down
253
+ #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
254
+ azimuth = np.arctan2(xyz[:,1], xyz[:,0])
255
+ return np.array([theta, azimuth, z])
256
+
257
+ def get_T(self, target_RT, cond_RT):
258
+ R, T = target_RT[:3, :3], target_RT[:, -1]
259
+ T_target = -R.T @ T
260
+
261
+ R, T = cond_RT[:3, :3], cond_RT[:, -1]
262
+ T_cond = -R.T @ T
263
+
264
+ theta_cond, azimuth_cond, z_cond = self.cartesian_to_spherical(T_cond[None, :])
265
+ theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
266
+
267
+ d_theta = theta_target - theta_cond
268
+ d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi)
269
+ d_z = z_target - z_cond
270
+
271
+ d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()])
272
+ return d_T
273
+
274
+ def load_im(self, path, color):
275
+ '''
276
+ replace background pixel with random color in rendering
277
+ '''
278
+ try:
279
+ img = plt.imread(path)
280
+ except:
281
+ print(path)
282
+ sys.exit()
283
+ img[img[:, :, -1] == 0.] = color
284
+ img = Image.fromarray(np.uint8(img[:, :, :3] * 255.))
285
+ return img
286
+
287
def __getitem__(self, index):
    """Return one training sample made of two random views of the same object.

    Two distinct view indices are drawn without replacement; for each view
    the image ``%03d.png`` and the array ``%03d.npy`` are read from the
    object's folder.

    Returns:
        dict with keys ``"image_target"``, ``"image_cond"`` and ``"T"``
        (the result of ``self.get_T(target_RT, cond_RT)``), plus ``"path"``
        when ``self.return_paths`` is set. The dict is passed through
        ``self.postprocess`` when that is not None.

    If anything fails while loading, a hard-coded known-good folder is
    loaded instead and both images are replaced by zeros of the same shape.
    """
    data = {}
    if self.paths[index][-2:] == '_1':  # dirty fix for rendering dataset twice
        total_view = 8
    else:
        total_view = 4
    index_target, index_cond = random.sample(range(total_view), 2)  # without replacement
    filename = os.path.join(self.root_dir, self.paths[index])

    if self.return_paths:
        data["path"] = str(filename)

    color = [1., 1., 1., 1.]

    def load_views(folder):
        # Read both images and both per-view arrays from one object folder.
        target_im = self.process_im(self.load_im(os.path.join(folder, '%03d.png' % index_target), color))
        cond_im = self.process_im(self.load_im(os.path.join(folder, '%03d.png' % index_cond), color))
        target_RT = np.load(os.path.join(folder, '%03d.npy' % index_target))
        cond_RT = np.load(os.path.join(folder, '%03d.npy' % index_cond))
        return target_im, cond_im, target_RT, cond_RT

    try:
        target_im, cond_im, target_RT, cond_RT = load_views(filename)
    except (Exception, SystemExit):
        # ``load_im`` reports unreadable files through ``sys.exit()``, hence
        # SystemExit is caught explicitly. KeyboardInterrupt is deliberately
        # NOT caught (the former bare ``except`` swallowed it).
        # very hacky solution, sorry about this
        filename = os.path.join(self.root_dir, '692db5f2d3a04bb286cb977a7dba903e_1')  # this one we know is valid
        target_im, cond_im, target_RT, cond_RT = load_views(filename)
        # Keep the fallback's arrays but blank both images.
        target_im = torch.zeros_like(target_im)
        cond_im = torch.zeros_like(cond_im)

    data["image_target"] = target_im
    data["image_cond"] = cond_im
    data["T"] = self.get_T(target_RT, cond_RT)

    if self.postprocess is not None:
        data = self.postprocess(data)

    return data
327
+
328
def process_im(self, im):
    """Convert ``im`` to RGB and return the result of ``self.tform`` on it."""
    rgb = im.convert("RGB")
    return self.tform(rgb)
331
+
332
class FolderData(Dataset):
    """Image dataset read from a folder, optionally paired with captions.

    Without a caption file, samples are the files found under ``root_dir``
    with the requested extension(s). With a caption file, samples are the
    keys of the caption mapping, resolved relative to ``root_dir``.
    """

    def __init__(self,
                 root_dir,
                 caption_file=None,
                 image_transforms=None,
                 ext="jpg",
                 default_caption="",
                 postprocess=None,
                 return_paths=False,
                 ) -> None:
        """Create a dataset from a folder of images.

        If you pass in a root directory it will be searched for images
        ending in ext (ext can be a list).

        Args:
            root_dir: Directory searched recursively for images.
            caption_file: Optional ``.json`` file (loaded as-is) or ``.jsonl``
                file whose records carry ``"file_name"`` and ``"text"``.
            image_transforms: Passed to ``make_tranforms``; ``None`` is
                treated as an empty list.
            ext: Image extension, or a list/tuple of extensions.
            default_caption: Caption used when none is available.
            postprocess: Callable applied to each sample dict, or a
                ``DictConfig`` that is instantiated into one.
            return_paths: When true, each sample also carries ``"path"``.

        Raises:
            ValueError: If the caption file suffix is neither ``.json`` nor
                ``.jsonl``.
        """
        self.root_dir = Path(root_dir)
        self.default_caption = default_caption
        self.return_paths = return_paths
        if isinstance(postprocess, DictConfig):
            postprocess = instantiate_from_config(postprocess)
        self.postprocess = postprocess
        if caption_file is not None:
            # Kept in its own variable: reusing ``ext`` here used to clobber
            # the image extension and made the glob below look for "*..json".
            caption_ext = Path(caption_file).suffix.lower()
            with open(caption_file, "rt") as f:
                if caption_ext == ".json":
                    captions = json.load(f)
                elif caption_ext == ".jsonl":
                    lines = [json.loads(x) for x in f.readlines()]
                    captions = {x["file_name"]: x["text"].strip("\n") for x in lines}
                else:
                    raise ValueError(f"Unrecognised format: {caption_ext}")
            self.captions = captions
        else:
            self.captions = None

        if not isinstance(ext, (tuple, list, ListConfig)):
            ext = [ext]

        # Only used if there is no caption file
        self.paths = []
        for e in ext:
            self.paths.extend(sorted(list(self.root_dir.rglob(f"*.{e}"))))
        # ``None`` stands in for the former mutable ``[]`` default.
        self.tform = make_tranforms([] if image_transforms is None else image_transforms)

    def __len__(self):
        """Number of captions if a caption file was given, else number of image paths."""
        if self.captions is not None:
            return len(self.captions.keys())
        else:
            return len(self.paths)

    def __getitem__(self, index):
        """Return a dict with ``"image"``, ``"txt"`` and optionally ``"path"``."""
        data = {}
        if self.captions is not None:
            chosen = list(self.captions.keys())[index]
            caption = self.captions.get(chosen, None)
            if caption is None:
                # A null caption in the file falls back to the default one.
                caption = self.default_caption
            filename = self.root_dir/chosen
        else:
            filename = self.paths[index]

        if self.return_paths:
            data["path"] = str(filename)

        im = Image.open(filename).convert("RGB")
        im = self.process_im(im)
        data["image"] = im

        if self.captions is not None:
            data["txt"] = caption
        else:
            data["txt"] = self.default_caption

        if self.postprocess is not None:
            data = self.postprocess(data)

        return data

    def process_im(self, im):
        """Convert ``im`` to RGB and apply ``self.tform``."""
        im = im.convert("RGB")
        return self.tform(im)
413
+ import random
414
+
415
class TransformDataset():
    """Wrap a dataset and apply one randomly chosen spatial transform per sample.

    The name of the chosen transform and ``extra_label`` are appended to the
    sample's ``'txt'`` entry.
    """

    def __init__(self, ds, extra_label="sksbspic"):
        self.ds = ds
        self.extra_label = extra_label
        self.transforms = {
            "align": transforms.Resize(768),
            "centerzoom": transforms.CenterCrop(768),
            "randzoom": transforms.RandomCrop(768),
        }

    def __getitem__(self, index):
        """Return ``self.ds[index]`` with a transformed image and extended caption."""
        sample = self.ds[index]

        # The permute moves the last axis to the front before the transforms
        # (assumes the image is stored channels-last -- TODO confirm).
        picture = sample['image'].permute(2, 0, 1)
        # In case data is smaller than expected
        picture = transforms.Resize(1024)(picture)

        tform_name = random.choice(list(self.transforms.keys()))
        picture = self.transforms[tform_name](picture)

        sample['image'] = picture.permute(1, 2, 0)
        sample['txt'] = sample['txt'] + f" {self.extra_label} {tform_name}"
        return sample

    def __len__(self):
        """Length of the wrapped dataset."""
        return len(self.ds)
446
+
447
def hf_dataset(
    name,
    image_transforms=[],
    image_column="image",
    text_column="text",
    split='train',
    image_key='image',
    caption_key='txt',
    ):
    """Make huggingface dataset with appropriate list of transforms applied.

    The dataset is loaded with ``load_dataset(name, split=split)`` and given an
    on-the-fly transform that maps ``image_column`` through the transforms into
    ``image_key`` and copies ``text_column`` into ``caption_key``.
    """
    ds = load_dataset(name, split=split)
    tform = make_tranforms(image_transforms)

    # Both source columns must exist in the loaded dataset.
    for column in (image_column, text_column):
        assert column in ds.column_names, f"Didn't find column {column} in {ds.column_names}"

    def pre_process(examples):
        return {
            image_key: [tform(im) for im in examples[image_column]],
            caption_key: examples[text_column],
        }

    ds.set_transform(pre_process)
    return ds
472
+
473
class TextOnly(Dataset):
    """Dataset that yields captions together with a constant dummy image."""

    def __init__(self, captions, output_size, image_key="image", caption_key="txt", n_gpus=1):
        """Returns only captions with dummy images"""
        self.output_size = output_size
        self.image_key = image_key
        self.caption_key = caption_key
        # A ``Path`` is read as a caption file; anything else is used directly.
        self.captions = self._load_caption_file(captions) if isinstance(captions, Path) else captions

        if n_gpus > 1:
            # hack to make sure that all the captions appear on each gpu:
            # every caption is repeated n_gpus times, consecutively.
            self.captions = [text for text in self.captions for _ in range(n_gpus)]

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, index):
        """Return the caption at ``index`` and an ``output_size`` square dummy image."""
        side = self.output_size
        blank = torch.zeros(3, side, side) * 2. - 1.
        blank = rearrange(blank, 'c h w -> h w c')
        return {self.image_key: blank, self.caption_key: self.captions[index]}

    def _load_caption_file(self, filename):
        """Read ``filename`` and return its lines with newline characters stripped."""
        with open(filename, 'rt') as f:
            return [line.strip('\n') for line in f]
502
+
503
+
504
+
505
+ import random
506
+ import json
507
class IdRetreivalDataset(FolderData):
    """``FolderData`` that pairs each image with a retrieved match.

    ``ret_file`` is a JSON file loaded into ``self.ret``; it is indexed by image
    file name and each entry is a collection of candidate file names.
    """

    def __init__(self, ret_file, *args, **kwargs):
        super().__init__(*args, **kwargs)
        with open(ret_file, "rt") as f:
            self.ret = json.load(f)

    def __getitem__(self, index):
        """Return the base sample plus ``"match"``: image and match concatenated on the last axis."""
        data = super().__getitem__(index)
        key = self.paths[index].name
        candidates = self.ret[key]
        # With no candidates the image is matched with itself.
        retreived = random.choice(candidates) if len(candidates) > 0 else key
        match_im = self.process_im(Image.open(self.root_dir/retreived).convert("RGB"))
        # data["match"] = im
        data["match"] = torch.cat((data["image"], match_im), dim=-1)
        return data
ldm/extras.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from omegaconf import OmegaConf
3
+ import torch
4
+ from ldm.util import instantiate_from_config
5
+ import logging
6
+ from contextlib import contextmanager
7
+
8
+ from contextlib import contextmanager
9
+ import logging
10
+
11
@contextmanager
def all_logging_disabled(highest_level=logging.CRITICAL):
    """
    Suppress every logging message emitted inside the ``with`` body.

    :param highest_level: the maximum logging level in use.
      Only needs changing if a custom level greater than CRITICAL is defined.

    On exit (normal or via exception) the previous global disable level is
    restored.

    https://gist.github.com/simon-weber/7853144
    """
    # Two compromises:
    # * the highest level in effect cannot be queried, so the caller supplies it
    # * the current override is read through an undocumented (but non-private)
    #   attribute of the logging manager
    saved_level = logging.root.manager.disable
    logging.disable(highest_level)
    try:
        yield
    finally:
        logging.disable(saved_level)
36
+
37
def load_training_dir(train_dir, device, epoch="last"):
    """Load a checkpoint and config from training directory.

    Args:
        train_dir: Directory searched recursively for ``*{epoch}.ckpt`` and
            ``*-project.yaml``.
        device: Forwarded to ``load_model_from_config``.
        epoch: Suffix of the checkpoint file name (before ``.ckpt``).

    Returns:
        The model returned by ``load_model_from_config``.

    Raises:
        AssertionError: If the number of matching checkpoints is not exactly
            one, or if no config file is found.
    """
    train_dir = Path(train_dir)
    ckpt = list(train_dir.rglob(f"*{epoch}.ckpt"))
    assert len(ckpt) == 1, f"found {len(ckpt)} matching ckpt files"
    config = list(train_dir.rglob("*-project.yaml"))
    # This used to test ``len(ckpt)``, so a missing config slipped through and
    # failed later with an IndexError.
    assert len(config) > 0, f"didn't find any config in {train_dir}"
    if len(config) > 1:
        print(f"found {len(config)} matching config files")
        # With several configs, the last one in sorted order wins.
        config = sorted(config)[-1]
        print(f"selecting {config}")
    else:
        config = config[0]

    config = OmegaConf.load(config)
    return load_model_from_config(config, ckpt[0], device)
54
+
55
def load_model_from_config(config, ckpt, device="cpu", verbose=False):
    """Loads a model from config and a ckpt.

    If config is a path it is loaded with OmegaConf first.

    Args:
        config: Config object with a ``model`` entry, or a ``str``/``Path``
            pointing to one.
        ckpt: Checkpoint file containing a ``"state_dict"`` entry.
        device: Device the model is moved to; also stored on
            ``model.cond_stage_model.device``.
        verbose: When true, print the missing and unexpected state-dict keys.

    Returns:
        The instantiated model, in eval mode, with weights loaded non-strictly.
    """
    if isinstance(config, (str, Path)):
        config = OmegaConf.load(config)

    with all_logging_disabled():
        print(f"Loading model from {ckpt}")
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        pl_sd = torch.load(ckpt, map_location="cpu")
        sd = pl_sd["state_dict"]
        model = instantiate_from_config(config.model)
        m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        # The list itself was never printed before, only the header.
        print(u)
    model.to(device)
    model.eval()
    model.cond_stage_model.device = device
    return model
ldm/guidance.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from scipy import interpolate
3
+ import numpy as np
4
+ import torch
5
+ import matplotlib.pyplot as plt
6
+ from IPython.display import clear_output
7
+ import abc
8
+
9
+
10
class GuideModel(torch.nn.Module, abc.ABC):
    """Abstract interface of a model that ``Guider`` can use for guidance.

    Subclasses must implement ``preprocess`` and ``compute_loss``.
    """

    def __init__(self) -> None:
        super().__init__()

    @abc.abstractmethod
    def preprocess(self, x_img):
        """Turn the decoded image ``x_img`` into the input of ``compute_loss``."""

    @abc.abstractmethod
    def compute_loss(self, inp):
        """Return the guidance loss for the preprocessed input ``inp``."""
21
+
22
+
23
class Guider(torch.nn.Module):
    """Modify a sampler's noise prediction with the gradient of a guidance loss."""

    def __init__(self, sampler, guide_model, scale=1.0, verbose=False):
        """Apply classifier guidance

        Specify a guidance scale as either a scalar
        Or a schedule as a list of tuples t = 0->1 and scale, e.g.
        [(0, 10), (0.5, 20), (1, 50)]

        Args:
            sampler: Object providing ``ddim_timesteps``,
                ``ddpm_num_timesteps`` and ``ddim_sqrt_one_minus_alphas``.
            guide_model: Object providing ``preprocess`` and ``compute_loss``.
            scale: Scalar scale, or a list/tuple of ``(time, scale)`` pairs.
            verbose: When true, print and plot diagnostics on every step.
        """
        super().__init__()
        self.sampler = sampler
        # Position in the sampling schedule; advanced by ``modify_score``.
        self.index = 0
        self.show = verbose
        self.guide_model = guide_model
        self.history = []

        if isinstance(scale, (tuple, list)):
            times = np.array([x[0] for x in scale])
            values = np.array([x[1] for x in scale])
            self.scale_schedule = {"times": times, "values": values}
        else:
            self.scale_schedule = float(scale)

        self.ddim_timesteps = sampler.ddim_timesteps
        self.ddpm_num_timesteps = sampler.ddpm_num_timesteps

    def get_scales(self):
        """Return one guidance scale per entry of ``self.ddim_timesteps``.

        A scalar scale is repeated; a schedule is linearly interpolated at
        ``ddim_timesteps / ddpm_num_timesteps``.
        """
        if isinstance(self.scale_schedule, float):
            return len(self.ddim_timesteps)*[self.scale_schedule]

        interpolater = interpolate.interp1d(self.scale_schedule["times"], self.scale_schedule["values"])
        fractional_steps = np.array(self.ddim_timesteps)/self.ddpm_num_timesteps
        return interpolater(fractional_steps)

    def modify_score(self, model, e_t, x, t, c):
        """Return ``e_t`` corrected by the scaled gradient of the guidance loss.

        Args:
            model: Object providing ``predict_start_from_noise`` and
                ``first_stage_model.decode``.
            e_t: Current noise prediction.
            x: Current latent; the loss gradient is taken with respect to it.
            t: Timestep forwarded to ``predict_start_from_noise``.
            c: Unused.

        Returns:
            ``e_t`` unchanged when the current scale is zero, otherwise
            ``e_t - sqrt_1ma * grads * scale``.
        """
        # TODO look up index by t
        scale = self.get_scales()[self.index]

        if (scale == 0):
            # A zero entry of a schedule is still a consumed step. Without
            # this increment the index stayed put and every later step read
            # the same zero scale. A constant zero scale keeps its old
            # behaviour (index untouched) so the object stays reusable.
            if not isinstance(self.scale_schedule, float):
                self.index += 1
            return e_t

        sqrt_1ma = self.sampler.ddim_sqrt_one_minus_alphas[self.index].to(x.device)
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            pred_x0 = model.predict_start_from_noise(x_in, t=t, noise=e_t)
            x_img = model.first_stage_model.decode((1/0.18215)*pred_x0)

            inp = self.guide_model.preprocess(x_img)
            loss = self.guide_model.compute_loss(inp)
            grads = torch.autograd.grad(loss.sum(), x_in)[0]
            correction = grads * scale

            if self.show:
                clear_output(wait=True)
                print(loss.item(), scale, correction.abs().max().item(), e_t.abs().max().item())
                self.history.append([loss.item(), scale, correction.min().item(), correction.max().item()])
                plt.imshow((inp[0].detach().permute(1,2,0).clamp(-1,1).cpu()+1)/2)
                plt.axis('off')
                plt.show()
                plt.imshow(correction[0][0].detach().cpu())
                plt.axis('off')
                plt.show()

        e_t_mod = e_t - sqrt_1ma*correction
        if self.show:
            fig, axs = plt.subplots(1, 3)
            axs[0].imshow(e_t[0][0].detach().cpu(), vmin=-2, vmax=+2)
            axs[1].imshow(e_t_mod[0][0].detach().cpu(), vmin=-2, vmax=+2)
            axs[2].imshow(correction[0][0].detach().cpu(), vmin=-2, vmax=+2)
            plt.show()
        self.index += 1
        return e_t_mod
ldm/lr_scheduler.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
class LambdaWarmUpCosineScheduler:
    """
    Linear warm-up followed by a cosine decay of a learning-rate multiplier.

    note: use with a base_lr of 1.0
    """

    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
        self.lr_warm_up_steps = warm_up_steps
        self.lr_start = lr_start
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.lr_max_decay_steps = max_decay_steps
        self.last_lr = 0.
        self.verbosity_interval = verbosity_interval

    def schedule(self, n, **kwargs):
        """Return the multiplier for step ``n`` and remember it in ``last_lr``."""
        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
            print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")

        warm_up = self.lr_warm_up_steps
        if n < warm_up:
            # Linear ramp from lr_start towards lr_max.
            lr = (self.lr_max - self.lr_start) / warm_up * n + self.lr_start
        else:
            # Cosine decay from lr_max to lr_min; progress is capped at 1.
            t = (n - warm_up) / (self.lr_max_decay_steps - warm_up)
            t = min(t, 1.0)
            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
                    1 + np.cos(t * np.pi))
        self.last_lr = lr
        return lr

    def __call__(self, n, **kwargs):
        return self.schedule(n,**kwargs)
34
+
35
+
36
class LambdaWarmUpCosineScheduler2:
    """
    supports repeated iterations, configurable via lists
    note: use with a base_lr of 1.0.

    Every constructor argument except ``verbosity_interval`` is a sequence
    with one entry per cycle; within a cycle the multiplier ramps linearly
    from ``f_start`` to ``f_max`` and then follows a cosine down to ``f_min``.
    """

    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
        self.lr_warm_up_steps = warm_up_steps
        self.f_start = f_start
        self.f_min = f_min
        self.f_max = f_max
        self.cycle_lengths = cycle_lengths
        # cum_cycles[i] is the global step at which cycle i starts.
        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
        self.last_f = 0.
        self.verbosity_interval = verbosity_interval

    def find_in_interval(self, n):
        """Return the index of the cycle containing global step ``n``.

        Raises:
            ValueError: If ``n`` lies beyond the end of the last cycle. This
                case used to return ``None``, which then failed later as an
                obscure indexing error.
        """
        for interval, cl in enumerate(self.cum_cycles[1:]):
            if n <= cl:
                return interval
        raise ValueError(
            f"step {n} is beyond the end of the schedule "
            f"({self.cum_cycles[-1]} steps in total)")

    def schedule(self, n, **kwargs):
        """Return the multiplier for global step ``n`` and remember it in ``last_f``."""
        cycle = self.find_in_interval(n)
        # From here on ``n`` counts steps inside the current cycle.
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                                                       f"current cycle {cycle}")
        if n < self.lr_warm_up_steps[cycle]:
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
            t = min(t, 1.0)
            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
                    1 + np.cos(t * np.pi))
            self.last_f = f
            return f

    def __call__(self, n, **kwargs):
        return self.schedule(n, **kwargs)
79
+
80
+
81
class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
    """Per-cycle linear warm-up followed by a linear (instead of cosine) decay."""

    def schedule(self, n, **kwargs):
        """Return the multiplier for global step ``n`` and remember it in ``last_f``."""
        cycle = self.find_in_interval(n)
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
            print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                  f"current cycle {cycle}")

        warm_up = self.lr_warm_up_steps[cycle]
        if n < warm_up:
            # Ramp from f_start towards f_max.
            f = (self.f_max[cycle] - self.f_start[cycle]) / warm_up * n + self.f_start[cycle]
        else:
            # Straight line that reaches f_min when n equals the cycle length.
            length = self.cycle_lengths[cycle]
            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (length - n) / (length)
        self.last_f = f
        return f
98
+
ldm/models/autoencoder.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pytorch_lightning as pl
3
+ import torch.nn.functional as F
4
+ from contextlib import contextmanager
5
+
6
+ from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
7
+
8
+ from ldm.modules.diffusionmodules.model import Encoder, Decoder
9
+ from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
10
+
11
+ from ldm.util import instantiate_from_config
12
+
13
+
14
class VQModel(pl.LightningModule):
    """Autoencoder with a vector-quantised bottleneck, as a LightningModule.

    Data flows ``Encoder`` -> ``quant_conv`` -> ``VectorQuantizer`` ->
    ``post_quant_conv`` -> ``Decoder``. Training uses two optimizers: one for
    the autoencoder parts and one for ``self.loss.discriminator``.

    ``self.learning_rate`` is read in ``configure_optimizers`` but never set
    here; presumably the training script assigns it -- TODO confirm.
    """

    def __init__(self,
                 ddconfig,
                 lossconfig,
                 n_embed,
                 embed_dim,
                 ckpt_path=None,
                 ignore_keys=[],
                 image_key="image",
                 colorize_nlabels=None,
                 monitor=None,
                 batch_resize_range=None,
                 scheduler_config=None,
                 lr_g_factor=1.0,
                 remap=None,
                 sane_index_shape=False,  # tell vector quantizer to return indices as bhw
                 use_ema=False
                 ):
        """Build the sub-modules and optionally restore weights.

        Args:
            ddconfig: Keyword arguments for ``Encoder`` and ``Decoder``; must
                contain ``"z_channels"``.
            lossconfig: Config instantiated into ``self.loss``.
            n_embed: Forwarded to ``VectorQuantizer`` (first argument).
            embed_dim: Channel count of the quantised latent.
            ckpt_path: Optional checkpoint restored via ``init_from_ckpt``.
            ignore_keys: Key prefixes dropped from that checkpoint.
            image_key: Batch key that holds the input images.
            colorize_nlabels: If given, size of the random ``colorize`` buffer
                used by ``to_rgb``.
            monitor: Stored as ``self.monitor`` when not None.
            batch_resize_range: Optional ``(lower, upper)`` sizes for random
                per-batch resizing in ``get_input``.
            scheduler_config: Optional config of an LR schedule object.
            lr_g_factor: Multiplier on the autoencoder learning rate.
            remap, sane_index_shape: Forwarded to ``VectorQuantizer``.
            use_ema: Keep an EMA copy of the weights in ``self.model_ema``.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.n_embed = n_embed
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.loss = instantiate_from_config(lossconfig)
        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
                                        remap=remap,
                                        sane_index_shape=sane_index_shape)
        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        if colorize_nlabels is not None:
            assert type(colorize_nlabels)==int
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor
        self.batch_resize_range = batch_resize_range
        if self.batch_resize_range is not None:
            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")

        self.use_ema = use_ema
        if self.use_ema:
            # ``LitEma`` is not imported at module level in this file, so the
            # EMA path raised NameError.
            # NOTE(review): import path assumed from the repo layout
            # (ldm/modules/ema.py) -- confirm.
            from ldm.modules.ema import LitEma
            self.model_ema = LitEma(self)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
        self.scheduler_config = scheduler_config
        self.lr_g_factor = lr_g_factor

    @staticmethod
    def _pl_version_at_least(major, minor):
        """Return True if ``pl.__version__`` is at least ``major.minor``.

        Stands in for ``version.parse(...)``, whose ``version`` name is never
        imported in this file. Only the leading ``major.minor`` digits are
        compared, so pre-release suffixes are ignored.
        """
        import re
        found = re.match(r"(\d+)\.(\d+)", pl.__version__)
        return found is not None and (int(found.group(1)), int(found.group(2))) >= (major, minor)

    @contextmanager
    def ema_scope(self, context=None):
        """Temporarily swap in the EMA weights (no-op when ``use_ema`` is False)."""
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def init_from_ckpt(self, path, ignore_keys=list()):
        """Load ``state_dict`` from ``path`` non-strictly, dropping ignored prefixes."""
        sd = torch.load(path, map_location="cpu")["state_dict"]
        for k in list(sd.keys()):
            # ``any`` deletes each key once; the former nested loop raised
            # KeyError when two prefixes matched the same key.
            if any(k.startswith(ik) for ik in ignore_keys):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
        missing, unexpected = self.load_state_dict(sd, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
            print(f"Unexpected Keys: {unexpected}")

    def on_train_batch_end(self, *args, **kwargs):
        """Update the EMA weights after every training batch."""
        if self.use_ema:
            self.model_ema(self)

    def encode(self, x):
        """Encode and quantise ``x``; returns ``(quant, emb_loss, info)``."""
        h = self.encoder(x)
        h = self.quant_conv(h)
        quant, emb_loss, info = self.quantize(h)
        return quant, emb_loss, info

    def encode_to_prequant(self, x):
        """Encode ``x`` up to, but not including, the quantiser."""
        h = self.encoder(x)
        h = self.quant_conv(h)
        return h

    def decode(self, quant):
        """Decode an already quantised latent."""
        quant = self.post_quant_conv(quant)
        dec = self.decoder(quant)
        return dec

    def decode_code(self, code_b):
        """Decode codebook indices via ``self.quantize.embed_code``."""
        quant_b = self.quantize.embed_code(code_b)
        dec = self.decode(quant_b)
        return dec

    def forward(self, input, return_pred_indices=False):
        """Return ``(reconstruction, quantisation loss)`` and optionally the indices."""
        quant, diff, (_,_,ind) = self.encode(input)
        dec = self.decode(quant)
        if return_pred_indices:
            return dec, diff, ind
        return dec, diff

    def get_input(self, batch, k):
        """Fetch ``batch[k]``, move the last axis to position 1 and cast to float.

        A 3-D input gets a trailing singleton axis first. When
        ``batch_resize_range`` is set, the batch is resized to a random
        multiple-of-16 step within that range (always the upper size for
        ``global_step <= 4``).
        """
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
        if self.batch_resize_range is not None:
            # ``np`` is not imported at module level in this file.
            import numpy as np
            lower_size = self.batch_resize_range[0]
            upper_size = self.batch_resize_range[1]
            if self.global_step <= 4:
                # do the first few batches with max size to avoid later oom
                new_resize = upper_size
            else:
                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
            if new_resize != x.shape[2]:
                x = F.interpolate(x, size=new_resize, mode="bicubic")
            x = x.detach()
        return x

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Compute the autoencoder (idx 0) or discriminator (idx 1) loss."""
        # https://github.com/pytorch/pytorch/issues/37142
        # try not to fool the heuristics
        x = self.get_input(batch, self.image_key)
        xrec, qloss, ind = self(x, return_pred_indices=True)

        if optimizer_idx == 0:
            # autoencode
            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
                                            last_layer=self.get_last_layer(), split="train",
                                            predicted_indices=ind)

            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
            return aeloss

        if optimizer_idx == 1:
            # discriminator
            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
                                            last_layer=self.get_last_layer(), split="train")
            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
            return discloss

    def validation_step(self, batch, batch_idx):
        """Validate with the current weights and again under ``ema_scope``."""
        log_dict = self._validation_step(batch, batch_idx)
        with self.ema_scope():
            # Run only for its logging side effects (``val_ema/...`` metrics).
            self._validation_step(batch, batch_idx, suffix="_ema")
        return log_dict

    def _validation_step(self, batch, batch_idx, suffix=""):
        """Log validation losses under ``val{suffix}/...`` and return them as a dict."""
        x = self.get_input(batch, self.image_key)
        xrec, qloss, ind = self(x, return_pred_indices=True)
        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
                                        self.global_step,
                                        last_layer=self.get_last_layer(),
                                        split="val"+suffix,
                                        predicted_indices=ind
                                        )

        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
                                            self.global_step,
                                            last_layer=self.get_last_layer(),
                                            split="val"+suffix,
                                            predicted_indices=ind
                                            )
        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
        self.log(f"val{suffix}/rec_loss", rec_loss,
                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
        self.log(f"val{suffix}/aeloss", aeloss,
                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
        if self._pl_version_at_least(1, 4):
            # rec_loss was logged explicitly above; drop it before log_dict.
            del log_dict_ae[f"val{suffix}/rec_loss"]
        self.log_dict(log_dict_ae)
        self.log_dict(log_dict_disc)
        # Previously returned the bound method ``self.log_dict`` by mistake.
        return {**log_dict_ae, **log_dict_disc}

    def configure_optimizers(self):
        """Create the two Adam optimizers and, if configured, LambdaLR schedulers."""
        lr_d = self.learning_rate
        lr_g = self.lr_g_factor*self.learning_rate
        print("lr_d", lr_d)
        print("lr_g", lr_g)
        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
                                  list(self.decoder.parameters())+
                                  list(self.quantize.parameters())+
                                  list(self.quant_conv.parameters())+
                                  list(self.post_quant_conv.parameters()),
                                  lr=lr_g, betas=(0.5, 0.9))
        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
                                    lr=lr_d, betas=(0.5, 0.9))

        if self.scheduler_config is not None:
            # ``LambdaLR`` is not imported at module level in this file.
            from torch.optim.lr_scheduler import LambdaLR
            scheduler = instantiate_from_config(self.scheduler_config)

            print("Setting up LambdaLR scheduler...")
            scheduler = [
                {
                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
                    'interval': 'step',
                    'frequency': 1
                },
                {
                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
                    'interval': 'step',
                    'frequency': 1
                },
            ]
            return [opt_ae, opt_disc], scheduler
        return [opt_ae, opt_disc], []

    def get_last_layer(self):
        """Weight of the decoder's output convolution (passed to the loss)."""
        return self.decoder.conv_out.weight

    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
        """Return a dict of inputs and reconstructions (plus EMA ones if asked)."""
        log = dict()
        x = self.get_input(batch, self.image_key)
        x = x.to(self.device)
        if only_inputs:
            log["inputs"] = x
            return log
        xrec, _ = self(x)
        if x.shape[1] > 3:
            # colorize with random projection
            assert xrec.shape[1] > 3
            x = self.to_rgb(x)
            xrec = self.to_rgb(xrec)
        log["inputs"] = x
        log["reconstructions"] = xrec
        if plot_ema:
            with self.ema_scope():
                xrec_ema, _ = self(x)
                if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
                log["reconstructions_ema"] = xrec_ema
        return log

    def to_rgb(self, x):
        """Project a many-channel map to 3 channels and rescale it to [-1, 1]."""
        assert self.image_key == "segmentation"
        if not hasattr(self, "colorize"):
            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
        x = F.conv2d(x, weight=self.colorize)
        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
        return x
262
+
263
+
264
class VQModelInterface(VQModel):
    """``VQModel`` variant whose ``encode`` stops before quantisation.

    Quantisation happens in ``decode`` instead, unless it is explicitly skipped.
    """

    def __init__(self, embed_dim, *args, **kwargs):
        super().__init__(embed_dim=embed_dim, *args, **kwargs)
        self.embed_dim = embed_dim

    def encode(self, x):
        """Return the pre-quantisation latent of ``x``."""
        return self.quant_conv(self.encoder(x))

    def decode(self, h, force_not_quantize=False):
        """Decode ``h``, passing it through the quantiser first unless told not to."""
        if force_not_quantize:
            quant = h
        else:
            # also go through quantization layer
            quant, _emb_loss, _info = self.quantize(h)
        return self.decoder(self.post_quant_conv(quant))
283
+
284
+
285
class AutoencoderKL(pl.LightningModule):
    """Autoencoder with a diagonal-Gaussian latent, as a LightningModule.

    ``encode`` returns a ``DiagonalGaussianDistribution`` built from the output
    of ``quant_conv``; ``decode`` maps a latent back through
    ``post_quant_conv`` and the decoder. Training uses two optimizers
    (autoencoder and ``self.loss.discriminator``).

    ``self.learning_rate`` is read in ``configure_optimizers`` but never set
    here; presumably the training script assigns it -- TODO confirm.
    """

    def __init__(self,
                 ddconfig,
                 lossconfig,
                 embed_dim,
                 ckpt_path=None,
                 ignore_keys=[],
                 image_key="image",
                 colorize_nlabels=None,
                 monitor=None,
                 ):
        """Build the sub-modules and optionally restore weights.

        Args:
            ddconfig: Keyword arguments for ``Encoder`` and ``Decoder``; must
                contain ``"z_channels"`` and a truthy ``"double_z"``.
            lossconfig: Config instantiated into ``self.loss``.
            embed_dim: Channel count of the latent.
            ckpt_path: Optional checkpoint restored via ``init_from_ckpt``.
            ignore_keys: Key prefixes dropped from that checkpoint.
            image_key: Batch key that holds the input images.
            colorize_nlabels: If given, size of the random ``colorize`` buffer
                used by ``to_rgb``.
            monitor: Stored as ``self.monitor`` when not None.
        """
        super().__init__()
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.loss = instantiate_from_config(lossconfig)
        assert ddconfig["double_z"]
        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        self.embed_dim = embed_dim
        if colorize_nlabels is not None:
            assert type(colorize_nlabels)==int
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def init_from_ckpt(self, path, ignore_keys=list()):
        """Load ``state_dict`` from ``path`` non-strictly, dropping ignored prefixes."""
        sd = torch.load(path, map_location="cpu")["state_dict"]
        for k in list(sd.keys()):
            # ``any`` deletes each key once; the former nested loop raised
            # KeyError when two prefixes matched the same key.
            if any(k.startswith(ik) for ik in ignore_keys):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
        self.load_state_dict(sd, strict=False)
        print(f"Restored from {path}")

    def encode(self, x):
        """Return the posterior distribution over latents for ``x``."""
        h = self.encoder(x)
        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior

    def decode(self, z):
        """Decode latent ``z``."""
        z = self.post_quant_conv(z)
        dec = self.decoder(z)
        return dec

    def forward(self, input, sample_posterior=True):
        """Return ``(reconstruction, posterior)``; uses a sample or the mode."""
        posterior = self.encode(input)
        if sample_posterior:
            z = posterior.sample()
        else:
            z = posterior.mode()
        dec = self.decode(z)
        return dec, posterior

    def get_input(self, batch, k):
        """Fetch ``batch[k]``, move the last axis to position 1 and cast to float.

        A 3-D input gets a trailing singleton axis first.
        """
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
        return x

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Compute the autoencoder (idx 0) or discriminator (idx 1) loss."""
        inputs = self.get_input(batch, self.image_key)
        reconstructions, posterior = self(inputs)

        if optimizer_idx == 0:
            # train encoder+decoder+logvar
            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
                                            last_layer=self.get_last_layer(), split="train")
            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
            return aeloss

        if optimizer_idx == 1:
            # train the discriminator
            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
                                                last_layer=self.get_last_layer(), split="train")

            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
            return discloss

    def validation_step(self, batch, batch_idx):
        """Log both validation losses and return the logged values as a dict."""
        inputs = self.get_input(batch, self.image_key)
        reconstructions, posterior = self(inputs)
        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
                                        last_layer=self.get_last_layer(), split="val")

        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
                                            last_layer=self.get_last_layer(), split="val")

        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
        self.log_dict(log_dict_ae)
        self.log_dict(log_dict_disc)
        # Previously returned the bound method ``self.log_dict`` by mistake.
        return {**log_dict_ae, **log_dict_disc}

    def configure_optimizers(self):
        """Create one Adam optimizer for the autoencoder and one for the discriminator."""
        lr = self.learning_rate
        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
                                  list(self.decoder.parameters())+
                                  list(self.quant_conv.parameters())+
                                  list(self.post_quant_conv.parameters()),
                                  lr=lr, betas=(0.5, 0.9))
        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
                                    lr=lr, betas=(0.5, 0.9))
        return [opt_ae, opt_disc], []

    def get_last_layer(self):
        """Weight of the decoder's output convolution (passed to the loss)."""
        return self.decoder.conv_out.weight

    @torch.no_grad()
    def log_images(self, batch, only_inputs=False, **kwargs):
        """Return a dict of inputs and, unless ``only_inputs``, samples and reconstructions."""
        log = dict()
        x = self.get_input(batch, self.image_key)
        x = x.to(self.device)
        if not only_inputs:
            xrec, posterior = self(x)
            if x.shape[1] > 3:
                # colorize with random projection
                assert xrec.shape[1] > 3
                x = self.to_rgb(x)
                xrec = self.to_rgb(xrec)
            # Decode pure noise shaped like a posterior sample.
            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
            log["reconstructions"] = xrec
        log["inputs"] = x
        return log

    def to_rgb(self, x):
        """Project a many-channel map to 3 channels and rescale it to [-1, 1]."""
        assert self.image_key == "segmentation"
        if not hasattr(self, "colorize"):
            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
        x = F.conv2d(x, weight=self.colorize)
        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
        return x
424
+
425
+
426
class IdentityFirstStage(torch.nn.Module):
    """First-stage stand-in whose encode/decode/forward return the input unchanged."""

    def __init__(self, *args, vq_interface=False, **kwargs):
        """Store ``vq_interface``; positional and other keyword arguments are ignored."""
        # Initialise the Module before assigning attributes on it.
        super().__init__()
        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff

    def encode(self, x, *args, **kwargs):
        return x

    def decode(self, x, *args, **kwargs):
        return x

    def quantize(self, x, *args, **kwargs):
        """Return ``x``, or ``(x, None, [None, None, None])`` when ``vq_interface`` is set."""
        if self.vq_interface:
            return x, None, [None, None, None]
        return x

    def forward(self, x, *args, **kwargs):
        return x
ldm/models/diffusion/__init__.py ADDED
File without changes
ldm/models/diffusion/classifier.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import pytorch_lightning as pl
4
+ from omegaconf import OmegaConf
5
+ from torch.nn import functional as F
6
+ from torch.optim import AdamW
7
+ from torch.optim.lr_scheduler import LambdaLR
8
+ from copy import deepcopy
9
+ from einops import rearrange
10
+ from glob import glob
11
+ from natsort import natsorted
12
+
13
+ from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
14
+ from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
15
+
16
# Maps a label key to the network class that NoisyLatentImageClassifier
# instantiates for it in ``load_classifier``.
__models__ = {
    'class_label': EncoderUNetModel,
    'segmentation': UNetModel,
}
20
+
21
+
22
def disabled_train(self, mode=True):
    """No-op replacement for ``train``: ignore ``mode`` and hand back ``self``.

    Assigned over a model's ``train`` attribute so that later train/eval
    switches leave the model's mode untouched.
    """
    return self
26
+
27
+
28
class NoisyLatentImageClassifier(pl.LightningModule):
    """Classifier trained on noised inputs produced by a frozen diffusion model.

    The diffusion model is instantiated from the newest ``*-project.yaml``
    under ``<diffusion_path>/configs`` and frozen. The classifier network is
    looked up in ``__models__`` by label key, built from a copy of the
    diffusion model's UNet config, and trained with cross-entropy on
    ``q_sample``-noised inputs.

    NOTE(review): ``self.learning_rate`` is read in ``configure_optimizers``
    but never assigned in this class; presumably the training script sets
    it -- confirm against the caller.
    """

    def __init__(self,
                 diffusion_path,
                 num_classes,
                 ckpt_path=None,
                 pool='attention',
                 label_key=None,
                 diffusion_ckpt_path=None,
                 scheduler_config=None,
                 weight_decay=1.e-2,
                 log_steps=10,
                 monitor='val/loss',
                 *args,
                 **kwargs):
        """Build the frozen diffusion model and the trainable classifier.

        Args:
            diffusion_path: Directory containing ``configs/*-project.yaml``.
            num_classes: Number of output channels of the classifier.
            ckpt_path: Optional classifier checkpoint to restore.
            pool: Pooling mode, only set on the config for ``'class_label'``.
            label_key: Batch key of the targets; only used when the diffusion
                model has no ``cond_stage_key`` attribute.
            diffusion_ckpt_path: Written to the diffusion config's
                ``params.ckpt_path`` before the model is instantiated.
            scheduler_config: Optional config of an LR schedule object.
            weight_decay: Weight decay passed to AdamW.
            log_steps: Number of timesteps visualised by ``log_images``.
            monitor: Stored on ``self.monitor``.

        Raises:
            AssertionError: If no label key can be determined.
            NotImplementedError: If the label key is not in ``__models__``.
        """
        super().__init__(*args, **kwargs)
        self.num_classes = num_classes
        # get latest config of diffusion model
        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
        self.diffusion_config = OmegaConf.load(diffusion_config).model
        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
        self.load_diffusion()

        self.monitor = monitor
        # Number of 2x downsamplings applied to segmentation targets.
        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
        self.log_steps = log_steps

        # The diffusion model's conditioning key wins over the explicit argument.
        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
            else self.diffusion_model.cond_stage_key

        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'

        if self.label_key not in __models__:
            raise NotImplementedError()

        self.load_classifier(ckpt_path, pool)

        self.scheduler_config = scheduler_config
        self.use_scheduler = self.scheduler_config is not None
        self.weight_decay = weight_decay

    def init_from_ckpt(self, path, ignore_keys=(), only_model=False):
        """Restore weights from a checkpoint, non-strictly.

        Args:
            path: Checkpoint file; a top-level ``"state_dict"`` entry is
                unwrapped when present.
            ignore_keys: Iterable of key prefixes to drop before loading.
            only_model: Load into ``self.model`` instead of the whole module.
        """
        # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
        sd = torch.load(path, map_location="cpu")
        if "state_dict" in sd:
            sd = sd["state_dict"]
        prefixes = tuple(ignore_keys)
        for k in list(sd.keys()):
            # startswith(tuple) matches any prefix, so each key is deleted at
            # most once even when several prefixes match it.
            if k.startswith(prefixes):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
        target = self.model if only_model else self
        missing, unexpected = target.load_state_dict(sd, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    def load_diffusion(self):
        """Instantiate the diffusion model in eval mode and freeze its parameters."""
        model = instantiate_from_config(self.diffusion_config)
        self.diffusion_model = model.eval()
        # Replace ``train`` so later train/eval switches do not change the mode.
        self.diffusion_model.train = disabled_train
        for param in self.diffusion_model.parameters():
            param.requires_grad = False

    def load_classifier(self, ckpt_path, pool):
        """Build the classifier network and optionally restore it from ``ckpt_path``."""
        model_config = deepcopy(self.diffusion_config.params.unet_config.params)
        # The classifier consumes what the diffusion UNet outputs.
        model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
        model_config.out_channels = self.num_classes
        if self.label_key == 'class_label':
            model_config.pool = pool

        self.model = __models__[self.label_key](**model_config)
        if ckpt_path is not None:
            print('#####################################################################')
            print(f'load from ckpt "{ckpt_path}"')
            print('#####################################################################')
            self.init_from_ckpt(ckpt_path)

    @torch.no_grad()
    def get_x_noisy(self, x, t, noise=None):
        """Return ``x`` noised to timestep ``t`` via the diffusion model's ``q_sample``."""
        noise = default(noise, lambda: torch.randn_like(x))
        continuous_sqrt_alpha_cumprod = None
        if self.diffusion_model.use_continuous_noise:
            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
            # todo: make sure t+1 is correct here

        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)

    def forward(self, x_noisy, t, *args, **kwargs):
        """Run the classifier network on ``x_noisy`` at timesteps ``t``."""
        return self.model(x_noisy, t)

    @torch.no_grad()
    def get_input(self, batch, k):
        """Fetch ``batch[k]`` and return it as a contiguous float ``b c h w`` tensor.

        A 3-D entry gets a trailing channel axis before the layout change.
        """
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = rearrange(x, 'b h w c -> b c h w')
        x = x.to(memory_format=torch.contiguous_format).float()
        return x

    @torch.no_grad()
    def get_conditioning(self, batch, k=None):
        """Return the targets ``batch[k]`` on the module's device.

        ``k`` defaults to ``self.label_key``. Segmentation targets are moved
        to ``b c h w`` and halved ``self.numd`` times with nearest-neighbour
        interpolation.
        """
        if k is None:
            k = self.label_key
        assert k is not None, 'Needs to provide label key'

        targets = batch[k].to(self.device)

        if self.label_key == 'segmentation':
            targets = rearrange(targets, 'b h w c -> b c h w')
            for down in range(self.numd):
                h, w = targets.shape[-2:]
                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')

            # targets = rearrange(targets,'b c h w -> b h w c')

        return targets

    def compute_top_k(self, logits, labels, k, reduction="mean"):
        """Compute top-``k`` accuracy of ``logits`` (classes on dim 1) against ``labels``.

        Args:
            logits: Scores with the class axis at dimension 1.
            labels: Integer targets shaped like ``logits`` without dimension 1.
            k: Number of top predictions to consider; capped at the number of
                classes so e.g. ``acc@5`` works with fewer than five classes.
            reduction: ``"mean"`` returns a Python float, ``"none"`` returns
                the per-element hit tensor.

        Raises:
            ValueError: If ``reduction`` is neither ``"mean"`` nor ``"none"``.
        """
        if reduction not in ("mean", "none"):
            raise ValueError(f'reduction must be "mean" or "none", got {reduction!r}')
        k = min(k, logits.shape[1])
        _, top_ks = torch.topk(logits, k, dim=1)
        # Sum over the top-k axis (dim 1). For 2-D logits this equals the last
        # axis; for 4-D segmentation logits the last axis would be the width.
        hits = (top_ks == labels[:, None]).float().sum(dim=1)
        if reduction == "mean":
            return hits.mean().item()
        return hits

    def on_train_epoch_start(self):
        """Move the diffusion model's inner network to the CPU."""
        # save some memory
        self.diffusion_model.model.to('cpu')

    @torch.no_grad()
    def write_logs(self, loss, logits, targets):
        """Log loss, acc@1, acc@5, global step and learning rate under ``train/`` or ``val/``."""
        log_prefix = 'train' if self.training else 'val'
        log = {}
        log[f"{log_prefix}/loss"] = loss.mean()
        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
            logits, targets, k=1, reduction="mean"
        )
        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
            logits, targets, k=5, reduction="mean"
        )

        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
        lr = self.optimizers().param_groups[0]['lr']
        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)

    def shared_step(self, batch, t=None):
        """Noise the inputs, classify them and compute the cross-entropy loss.

        Args:
            batch: Batch read by the diffusion model's ``get_input`` and by
                ``get_conditioning``.
            t: Fixed timestep for the whole batch; sampled uniformly per
                element when ``None``.

        Returns:
            tuple: ``(mean loss, logits, x_noisy, targets)``.
        """
        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
        targets = self.get_conditioning(batch)
        if targets.dim() == 4:
            # Channel-wise maps are reduced to class indices.
            targets = targets.argmax(dim=1)
        if t is None:
            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
        else:
            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
        x_noisy = self.get_x_noisy(x, t)
        logits = self(x_noisy, t)

        loss = F.cross_entropy(logits, targets, reduction='none')

        self.write_logs(loss.detach(), logits.detach(), targets.detach())

        loss = loss.mean()
        return loss, logits, x_noisy, targets

    def training_step(self, batch, batch_idx):
        """Return the loss of ``shared_step`` at random timesteps."""
        loss, *_ = self.shared_step(batch)
        return loss

    def reset_noise_accs(self):
        """Reset the per-timestep accuracy lists filled during validation."""
        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}

    def on_validation_start(self):
        """Clear the accuracy accumulators before validation."""
        self.reset_noise_accs()

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        """Return the random-timestep loss and record accuracies at each tracked timestep."""
        loss, *_ = self.shared_step(batch)

        for t in self.noisy_acc:
            _, logits, _, targets = self.shared_step(batch, t)
            self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
            self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))

        return loss

    def configure_optimizers(self):
        """Create AdamW over the classifier parameters, plus a per-step LambdaLR when configured."""
        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)

        if self.use_scheduler:
            scheduler = instantiate_from_config(self.scheduler_config)

            print("Setting up LambdaLR scheduler...")
            scheduler = [
                {
                    'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
                    'interval': 'step',
                    'frequency': 1
                }]
            return [optimizer], scheduler

        return optimizer

    @torch.no_grad()
    def log_images(self, batch, N=8, *args, **kwargs):
        """Collect inputs, labels and per-timestep noisy inputs/predictions.

        Args:
            batch: Batch to visualise.
            N: Maximum number of leading entries kept per log item.

        Returns:
            dict: Log entries, each truncated to its first ``N`` elements.
        """
        log = dict()
        x = self.get_input(batch, self.diffusion_model.first_stage_key)
        log['inputs'] = x

        y = self.get_conditioning(batch)

        if self.label_key == 'class_label':
            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
            log['labels'] = y

        if ismap(y):
            log['labels'] = self.diffusion_model.to_rgb(y)

            for step in range(self.log_steps):
                current_time = step * self.log_time_interval

                _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)

                log[f'inputs@t{current_time}'] = x_noisy

                if logits.dim() != 4:
                    # Non-spatial logits cannot go through the 4-D rearrange
                    # below, so no prediction map is rendered for them.
                    continue

                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
                pred = rearrange(pred, 'b h w c -> b c h w')

                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)

        for key in log:
            log[key] = log[key][:N]

        return log
ldm/models/diffusion/ddim.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from functools import partial
7
+ from einops import rearrange
8
+
9
+ from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor
10
+ from ldm.models.diffusion.sampling_util import renorm_thresholding, norm_thresholding, spatial_norm_thresholding
11
+
12
+
13
class DDIMSampler(object):
    """DDIM sampler around a trained diffusion model (sampling only).

    ``make_schedule`` copies the model's noise schedule onto the sampler and
    derives the DDIM sub-schedule; the sampling, encoding and decoding
    methods then step through it by calling ``model.apply_model``.
    """

    def __init__(self, model, schedule="linear", **kwargs):
        """Store the model, its number of DDPM timesteps and the schedule name.

        ``**kwargs`` is accepted and ignored.
        """
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def to(self, device):
        """Move every tensor attribute of the sampler to ``device``.

        Plays the role of ``torch.nn.Module.to`` for this plain object.
        Non-tensor attributes, including ``self.model``, are left untouched.
        """
        # Snapshot the items so reassigning attributes cannot disturb iteration.
        for k, v in list(self.__dict__.items()):
            if isinstance(v, torch.Tensor):
                setattr(self, k, v.to(device))

    def register_buffer(self, name, attr):
        """Store ``attr`` on the sampler under ``name``.

        Tensors are moved to the CUDA device when one is available. On
        CPU-only hosts they stay where they are rather than raising on an
        unconditional CUDA transfer. Non-tensors are stored unchanged.
        """
        if isinstance(attr, torch.Tensor) and torch.cuda.is_available():
            attr = attr.to(torch.device("cuda"))
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        """Compute and register the DDIM timesteps and schedule-derived buffers.

        Args:
            ddim_num_steps: Number of DDIM steps to select.
            ddim_discretize: Discretisation method passed to ``make_ddim_timesteps``.
            ddim_eta: Eta passed to ``make_ddim_sampling_parameters`` and used
                to scale the sigmas for the original number of steps.
            verbose: Forwarded to the schedule helpers.

        Raises:
            AssertionError: If the model's ``alphas_cumprod`` does not have one
                entry per DDPM timestep.
        """
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta,verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               dynamic_threshold=None,
               **kwargs
               ):
        """Build the schedule for ``S`` steps and run DDIM sampling.

        Args:
            S: Number of DDIM steps.
            batch_size: Number of samples; a warning is printed when the
                conditioning's leading dimension differs.
            shape: ``(C, H, W)`` of one sample.
            conditioning: Tensor, or dict of tensors / (nested) lists of tensors.
            eta: DDIM eta used for the schedule.
            x_T: Optional start tensor; random noise is drawn when ``None``.
            unconditional_conditioning: Must come in the same format as
                ``conditioning``.

        The remaining arguments are forwarded to ``ddim_sampling``.
        ``normals_sequence`` and ``**kwargs`` are accepted and ignored.

        Returns:
            tuple: ``(samples, intermediates)`` as returned by ``ddim_sampling``.
        """
        if conditioning is not None:
            if isinstance(conditioning, dict):
                ctmp = conditioning[list(conditioning.keys())[0]]
                # Descend through nested lists to the first tensor.
                while isinstance(ctmp, list):
                    ctmp = ctmp[0]
                cbs = ctmp.shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")

            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for DDIM sampling is {size}, eta {eta}')

        samples, intermediates = self.ddim_sampling(conditioning, size,
                                                    callback=callback,
                                                    img_callback=img_callback,
                                                    quantize_denoised=quantize_x0,
                                                    mask=mask, x0=x0,
                                                    ddim_use_original_steps=False,
                                                    noise_dropout=noise_dropout,
                                                    temperature=temperature,
                                                    score_corrector=score_corrector,
                                                    corrector_kwargs=corrector_kwargs,
                                                    x_T=x_T,
                                                    log_every_t=log_every_t,
                                                    unconditional_guidance_scale=unconditional_guidance_scale,
                                                    unconditional_conditioning=unconditional_conditioning,
                                                    dynamic_threshold=dynamic_threshold,
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def ddim_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
                      callback=None, timesteps=None, quantize_denoised=False,
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None,
                      t_start=-1):
        """Run the reverse loop from ``x_T`` (or fresh noise) down to the first timestep.

        Args:
            cond: Conditioning handed to ``p_sample_ddim``.
            shape: Full ``(B, C, H, W)`` shape of the samples.
            ddim_use_original_steps: Step through the DDPM timesteps instead
                of the DDIM subset.
            timesteps: Optional number of timesteps to use.
            mask, x0: When ``mask`` is given, the masked region is replaced at
                every step by ``x0`` noised to the current timestep.
            callback: If set, ``img = callback(i, img, pred_x0)`` after each step.
            img_callback: If set, called as ``img_callback(pred_x0, i)``.
            log_every_t: Interval, in schedule indices, for storing intermediates.
            t_start: The timestep array is cut to ``timesteps[:t_start]``.
                NOTE: the default ``-1`` drops the last (largest) timestep.

        Returns:
            tuple: ``(img, intermediates)`` where ``intermediates`` has the
            lists ``'x_inter'`` and ``'pred_x0'``.
        """
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if ddim_use_original_steps:
            # Materialise the DDPM steps as an array so the slice below works;
            # a bare int cannot be sliced.
            num_original = self.ddpm_num_timesteps if timesteps is None else timesteps
            timesteps = np.arange(num_original)
        elif timesteps is None:
            timesteps = self.ddim_timesteps
        else:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        timesteps = timesteps[:t_start]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
        time_range = np.flip(timesteps)
        total_steps = timesteps.shape[0]
        print(f"Running DDIM Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)

        for i, step in enumerate(iterator):
            # ``index`` counts down through the schedule arrays while ``i`` counts up.
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
                                      dynamic_threshold=dynamic_threshold)
            img, pred_x0 = outs
            if callback:
                img = callback(i, img, pred_x0)
            if img_callback:
                img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)

        return img, intermediates

    @torch.no_grad()
    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None,
                      dynamic_threshold=None):
        """Perform one DDIM update.

        Args:
            x: Current sample.
            c: Conditioning (tensor or dict of tensors / lists of tensors).
            t: Timestep tensor handed to the model.
            index: Position in the schedule arrays for this step.
            use_original_steps: Read the DDPM schedule instead of the DDIM one.
            unconditional_guidance_scale, unconditional_conditioning: When a
                scale other than 1 and an unconditional conditioning are
                given, both branches are evaluated in one batch and combined
                as ``uncond + scale * (cond - uncond)``.

        Returns:
            tuple: ``(x_prev, pred_x0)``.
        """
        b, *_, device = *x.shape, x.device

        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
            e_t = self.model.apply_model(x, t, c)
        else:
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t] * 2)
            if isinstance(c, dict):
                assert isinstance(unconditional_conditioning, dict)
                c_in = dict()
                for k in c:
                    if isinstance(c[k], list):
                        c_in[k] = [torch.cat([
                            unconditional_conditioning[k][i],
                            c[k][i]]) for i in range(len(c[k]))]
                    else:
                        c_in[k] = torch.cat([
                            unconditional_conditioning[k],
                            c[k]])
            else:
                c_in = torch.cat([unconditional_conditioning, c])
            e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
            e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

        if score_corrector is not None:
            assert self.model.parameterization == "eps"
            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        # ``make_schedule`` registers the original-step sigmas on the sampler,
        # so they are read from ``self`` rather than from the model.
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
        # select parameters corresponding to the currently considered timestep
        a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
        a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
        sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)

        # current prediction for x_0
        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
        if quantize_denoised:
            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)

        if dynamic_threshold is not None:
            pred_x0 = norm_thresholding(pred_x0, dynamic_threshold)

        # direction pointing to x_t
        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
        if noise_dropout > 0.:
            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
        return x_prev, pred_x0

    @torch.no_grad()
    def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None,
               unconditional_guidance_scale=1.0, unconditional_conditioning=None):
        """Deterministically step ``x0`` forward through ``t_enc`` schedule steps.

        Args:
            x0: Starting tensor.
            c: Conditioning for ``model.apply_model``.
            t_enc: Number of steps; must not exceed the number of reference steps.
            use_original_steps: Use the DDPM schedule instead of the DDIM one.
            return_intermediates: Approximate number of intermediate states
                to collect; falsy to collect none.
            unconditional_guidance_scale, unconditional_conditioning: Guidance
                inputs; a scale other than 1 requires the unconditional
                conditioning.

        Returns:
            tuple: ``(x_next, out)`` where ``out`` holds ``'x_encoded'``,
            ``'intermediate_steps'`` and, if requested, ``'intermediates'``.
        """
        num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[0]

        assert t_enc <= num_reference_steps
        num_steps = t_enc

        if use_original_steps:
            alphas_next = self.alphas_cumprod[:num_steps]
            alphas = self.alphas_cumprod_prev[:num_steps]
        else:
            alphas_next = self.ddim_alphas[:num_steps]
            alphas = torch.tensor(self.ddim_alphas_prev[:num_steps])

        x_next = x0
        intermediates = []
        inter_steps = []
        for i in tqdm(range(num_steps), desc='Encoding Image'):
            t = torch.full((x0.shape[0],), i, device=self.model.device, dtype=torch.long)
            if unconditional_guidance_scale == 1.:
                noise_pred = self.model.apply_model(x_next, t, c)
            else:
                assert unconditional_conditioning is not None
                e_t_uncond, noise_pred = torch.chunk(
                    self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)),
                                           torch.cat((unconditional_conditioning, c))), 2)
                noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond)

            xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next
            weighted_noise_pred = alphas_next[i].sqrt() * (
                    (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred
            x_next = xt_weighted + weighted_noise_pred
            if return_intermediates:
                # Keep the stride at 1 or more so that asking for more
                # intermediates than steps cannot cause a modulo by zero.
                stride = max(1, num_steps // return_intermediates)
                if (i % stride == 0 and i < num_steps - 1) or i >= num_steps - 2:
                    intermediates.append(x_next)
                    inter_steps.append(i)

        out = {'x_encoded': x_next, 'intermediate_steps': inter_steps}
        if return_intermediates:
            out.update({'intermediates': intermediates})
        return x_next, out

    @torch.no_grad()
    def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
        """Noise ``x0`` in a single step using the schedule entries gathered at ``t``.

        Fast, but does not allow for exact reconstruction. ``noise`` defaults
        to standard Gaussian noise shaped like ``x0``.
        """
        # t serves as an index to gather the correct alphas
        if use_original_steps:
            sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
            sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
        else:
            sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
            sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas

        if noise is None:
            noise = torch.randn_like(x0)
        return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
                extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)

    @torch.no_grad()
    def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
               use_original_steps=False):
        """Denoise ``x_latent`` over the first ``t_start`` schedule steps, in reverse order.

        Returns:
            The tensor after the last ``p_sample_ddim`` step.
        """
        timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
        timesteps = timesteps[:t_start]

        time_range = np.flip(timesteps)
        total_steps = timesteps.shape[0]
        print(f"Running DDIM Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
        x_dec = x_latent
        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
            x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
                                          unconditional_guidance_scale=unconditional_guidance_scale,
                                          unconditional_conditioning=unconditional_conditioning)
        return x_dec
ldm/models/diffusion/ddpm.py ADDED
@@ -0,0 +1,1994 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ wild mixture of
3
+ https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
4
+ https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
5
+ https://github.com/CompVis/taming-transformers
6
+ -- merci
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import numpy as np
12
+ import pytorch_lightning as pl
13
+ from torch.optim.lr_scheduler import LambdaLR
14
+ from einops import rearrange, repeat
15
+ from contextlib import contextmanager, nullcontext
16
+ from functools import partial
17
+ import itertools
18
+ from tqdm import tqdm
19
+ from torchvision.utils import make_grid
20
+ from pytorch_lightning.utilities.distributed import rank_zero_only
21
+ from omegaconf import ListConfig
22
+
23
+ from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
24
+ from ldm.modules.ema import LitEma
25
+ from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
26
+ from ldm.models.autoencoder import VQModelInterface, IdentityFirstStage, AutoencoderKL
27
+ from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
28
+ from ldm.models.diffusion.ddim import DDIMSampler
29
+ from ldm.modules.attention import CrossAttention
30
+
31
+
32
# Conditioning mode -> name under which that conditioning is keyed.
__conditioning_keys__ = {
    'concat': 'c_concat',
    'crossattn': 'c_crossattn',
    'adm': 'y',
}
35
+
36
+
37
def disabled_train(self, mode=True):
    """No-op replacement for ``train``: ignore ``mode`` and hand back ``self``.

    Assigned over a model's ``train`` attribute so that later train/eval
    switches leave the model's mode untouched.
    """
    return self
41
+
42
+
43
def uniform_on_device(r1, r2, shape, device):
    """Draw a tensor of the given ``shape`` on ``device``, uniform between ``r2`` and ``r1``.

    Each entry is ``(r1 - r2) * u + r2`` with ``u`` drawn by ``torch.rand``.
    """
    unit = torch.rand(*shape, device=device)
    width = r1 - r2
    return width * unit + r2
45
+
46
+
47
+ class DDPM(pl.LightningModule):
48
+ # classic DDPM with Gaussian diffusion, in image space
49
def __init__(self,
             unet_config,
             timesteps=1000,
             beta_schedule="linear",
             loss_type="l2",
             ckpt_path=None,
             ignore_keys=None,
             load_only_unet=False,
             monitor="val/loss",
             use_ema=True,
             first_stage_key="image",
             image_size=256,
             channels=3,
             log_every_t=100,
             clip_denoised=True,
             linear_start=1e-4,
             linear_end=2e-2,
             cosine_s=8e-3,
             given_betas=None,
             original_elbo_weight=0.,
             v_posterior=0.,  # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
             l_simple_weight=1.,
             conditioning_key=None,
             parameterization="eps",  # all assuming fixed variance schedules
             scheduler_config=None,
             use_positional_encodings=False,
             learn_logvar=False,
             logvar_init=0.,
             make_it_fit=False,
             ucg_training=None,
             ):
    """Build the diffusion wrapper, optional EMA copy and the noise schedule.

    :param unet_config: passed together with ``conditioning_key`` to ``DiffusionWrapper``.
    :param timesteps, beta_schedule, linear_start, linear_end, cosine_s, given_betas:
        forwarded unchanged to ``register_schedule``.
    :param loss_type: stored and later dispatched on by ``get_loss``.
    :param ckpt_path: when not None, ``init_from_ckpt`` is called with it.
    :param ignore_keys: forwarded to ``init_from_ckpt``; ``None`` means an empty list.
    :param load_only_unet: forwarded to ``init_from_ckpt`` as ``only_model``.
    :param monitor: stored on ``self.monitor`` unless None.
    :param use_ema: when true, a ``LitEma`` copy of the wrapped model is kept.
    :param parameterization: must be ``"eps"`` or ``"x0"``.
    :param learn_logvar: when true, ``self.logvar`` becomes an ``nn.Parameter``.
    :param logvar_init: fill value for the per-timestep ``logvar`` tensor.
    :param ucg_training: optional mapping read by ``training_step``; when it is
        non-empty a ``np.random.RandomState`` is created for it.

    The remaining arguments are stored on the instance under the same name.
    """
    super().__init__()
    assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
    self.parameterization = parameterization
    print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
    self.cond_stage_model = None
    self.clip_denoised = clip_denoised
    self.log_every_t = log_every_t
    self.first_stage_key = first_stage_key
    self.image_size = image_size  # try conv?
    self.channels = channels
    self.use_positional_encodings = use_positional_encodings
    self.model = DiffusionWrapper(unet_config, conditioning_key)
    count_params(self.model, verbose=True)
    self.use_ema = use_ema
    if self.use_ema:
        self.model_ema = LitEma(self.model)
        print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

    self.use_scheduler = scheduler_config is not None
    if self.use_scheduler:
        self.scheduler_config = scheduler_config

    self.v_posterior = v_posterior
    self.original_elbo_weight = original_elbo_weight
    self.l_simple_weight = l_simple_weight

    if monitor is not None:
        self.monitor = monitor
    self.make_it_fit = make_it_fit
    if ckpt_path is not None:
        # A fresh list per call replaces the former shared ``[]`` default.
        self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys or [], only_model=load_only_unet)

    # Registered after the optional checkpoint load, so the schedule buffers
    # always reflect the arguments given here.
    self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
                           linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)

    self.loss_type = loss_type

    self.learn_logvar = learn_logvar
    # ``num_timesteps`` is set by ``register_schedule`` above.
    self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
    if self.learn_logvar:
        self.logvar = nn.Parameter(self.logvar, requires_grad=True)

    self.ucg_training = ucg_training or dict()
    if self.ucg_training:
        self.ucg_prng = np.random.RandomState()
125
+
126
def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
                      linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """Compute the beta schedule and register every derived quantity as a buffer.

    ``given_betas`` is used as-is when it exists; otherwise the schedule comes
    from ``make_beta_schedule``. Sets ``num_timesteps``, ``linear_start`` and
    ``linear_end`` and registers float32 buffers for the forward process, the
    posterior q(x_{t-1} | x_t, x_0) and the non-persistent ``lvlb_weights``.
    """
    if exists(given_betas):
        betas = given_betas
    else:
        betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start,
                                   linear_end=linear_end, cosine_s=cosine_s)
    alphas = 1. - betas
    alphas_cumprod = np.cumprod(alphas, axis=0)
    alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

    timesteps, = betas.shape
    self.num_timesteps = int(timesteps)
    self.linear_start = linear_start
    self.linear_end = linear_end
    assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'

    to_torch = partial(torch.tensor, dtype=torch.float32)

    # posterior variance; equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
    # when v_posterior is 0
    posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
            1. - alphas_cumprod) + self.v_posterior * betas

    # Insertion order is the registration order.
    schedule_buffers = {
        'betas': betas,
        'alphas_cumprod': alphas_cumprod,
        'alphas_cumprod_prev': alphas_cumprod_prev,
        # calculations for diffusion q(x_t | x_{t-1}) and others
        'sqrt_alphas_cumprod': np.sqrt(alphas_cumprod),
        'sqrt_one_minus_alphas_cumprod': np.sqrt(1. - alphas_cumprod),
        'log_one_minus_alphas_cumprod': np.log(1. - alphas_cumprod),
        'sqrt_recip_alphas_cumprod': np.sqrt(1. / alphas_cumprod),
        'sqrt_recipm1_alphas_cumprod': np.sqrt(1. / alphas_cumprod - 1),
        # calculations for posterior q(x_{t-1} | x_t, x_0)
        'posterior_variance': posterior_variance,
        # log is clipped because the posterior variance is 0 at the beginning of the diffusion chain
        'posterior_log_variance_clipped': np.log(np.maximum(posterior_variance, 1e-20)),
        'posterior_mean_coef1': betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod),
        'posterior_mean_coef2': (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod),
    }
    for buffer_name, values in schedule_buffers.items():
        self.register_buffer(buffer_name, to_torch(values))

    if self.parameterization == "eps":
        lvlb_weights = self.betas ** 2 / (
                2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
    elif self.parameterization == "x0":
        lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
    else:
        raise NotImplementedError("mu not supported")
    # TODO how to choose this term
    lvlb_weights[0] = lvlb_weights[1]
    self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
    # NOTE(review): this only fails when *every* weight is NaN — confirm that is intended.
    assert not torch.isnan(self.lvlb_weights).all()
179
+
180
@contextmanager
def ema_scope(self, context=None):
    """Context manager that runs its body with the EMA weights swapped in.

    When ``self.use_ema`` is set, the current parameters are stored and the EMA
    weights copied into ``self.model`` on entry; the stored parameters are
    restored on exit, even if the body raises. ``context`` is an optional label
    used in the printed messages. Yields ``None``.
    """
    announce = context is not None
    if self.use_ema:
        self.model_ema.store(self.model.parameters())
        self.model_ema.copy_to(self.model)
        if announce:
            print(f"{context}: Switched to EMA weights")
    try:
        yield None
    finally:
        if self.use_ema:
            self.model_ema.restore(self.model.parameters())
            if announce:
                print(f"{context}: Restored training weights")
194
+
195
@torch.no_grad()
def init_from_ckpt(self, path, ignore_keys=None, only_model=False):
    """Load weights from a checkpoint file into this module (non-strict).

    :param path: checkpoint location handed to ``torch.load``.
    :param ignore_keys: optional iterable of name prefixes; every checkpoint
        entry whose key starts with one of them is dropped before loading.
    :param only_model: when true the weights are loaded into ``self.model``
        instead of ``self``.

    If ``self.make_it_fit`` is set, checkpoint tensors whose shape differs from
    the matching parameter/buffer are tiled along the first two axes to fit the
    new shape before loading. Missing and unexpected keys are printed.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
    sd = torch.load(path, map_location="cpu")
    if "state_dict" in sd:
        sd = sd["state_dict"]

    # Honour ``ignore_keys``: previously the argument was accepted but never applied.
    prefixes = tuple(ignore_keys or ())
    for key in list(sd.keys()):
        if any(key.startswith(prefix) for prefix in prefixes):
            print(f"Deleting key {key} from state_dict.")
            del sd[key]

    if self.make_it_fit:
        named_tensors = list(itertools.chain(self.named_parameters(), self.named_buffers()))
        for name, param in tqdm(named_tensors,
                                desc="Fitting old weights to new weights",
                                total=len(named_tensors)):
            if name not in sd:
                continue
            old_shape = sd[name].shape
            new_shape = param.shape
            assert len(old_shape) == len(new_shape)
            if len(new_shape) > 2:
                # we only modify first two axes
                assert new_shape[2:] == old_shape[2:]
            # assumes first axis corresponds to output dim
            if new_shape == old_shape:
                continue

            new_param = param.clone()
            old_param = sd[name]
            if len(new_shape) == 1:
                for i in range(new_param.shape[0]):
                    new_param[i] = old_param[i % old_shape[0]]
            elif len(new_shape) >= 2:
                for i in range(new_param.shape[0]):
                    for j in range(new_param.shape[1]):
                        new_param[i, j] = old_param[i % old_shape[0], j % old_shape[1]]

                # Per-column divisor: one plus the number of new columns that
                # were filled from each old column (the count starts at one).
                n_used_old = torch.ones(old_shape[1])
                for j in range(new_param.shape[1]):
                    n_used_old[j % old_shape[1]] += 1
                n_used_new = torch.zeros(new_shape[1])
                for j in range(new_param.shape[1]):
                    n_used_new[j] = n_used_old[j % old_shape[1]]

                # Broadcast the divisor over axis 0 and any trailing axes.
                n_used_new = n_used_new[None, :]
                while len(n_used_new.shape) < len(new_shape):
                    n_used_new = n_used_new.unsqueeze(-1)
                new_param /= n_used_new

            sd[name] = new_param

    target = self.model if only_model else self
    missing, unexpected = target.load_state_dict(sd, strict=False)
    print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
    if len(missing) > 0:
        print(f"Missing Keys: {missing}")
    if len(unexpected) > 0:
        print(f"Unexpected Keys: {unexpected}")
253
+
254
def q_mean_variance(self, x_start, t):
    """
    Parameters of the forward distribution q(x_t | x_0).
    :param x_start: the [N x C x ...] tensor of noiseless inputs.
    :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
    :return: (mean, variance, log_variance), each extracted to x_start's shape.
    """
    target_shape = x_start.shape
    signal_scale = extract_into_tensor(self.sqrt_alphas_cumprod, t, target_shape)
    variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, target_shape)
    log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, target_shape)
    return signal_scale * x_start, variance, log_variance
265
+
266
def predict_start_from_noise(self, x_t, t, noise):
    """Recover the x_0 estimate from ``x_t`` and a noise prediction.

    Computes ``sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * noise``
    with both coefficients extracted to ``x_t``'s shape.
    """
    recip = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    recipm1 = extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return recip * x_t - recipm1 * noise
271
+
272
def q_posterior(self, x_start, x_t, t):
    """Mean, variance and clipped log-variance of q(x_{t-1} | x_t, x_0).

    The mean is ``posterior_mean_coef1[t] * x_start + posterior_mean_coef2[t] * x_t``;
    the two variance terms are read from the registered buffers at ``t``.
    """
    shape = x_t.shape
    coef_start = extract_into_tensor(self.posterior_mean_coef1, t, shape)
    coef_current = extract_into_tensor(self.posterior_mean_coef2, t, shape)
    posterior_mean = coef_start * x_start + coef_current * x_t
    posterior_variance = extract_into_tensor(self.posterior_variance, t, shape)
    posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, shape)
    return posterior_mean, posterior_variance, posterior_log_variance_clipped
280
+
281
def p_mean_variance(self, x, t, clip_denoised: bool):
    """Run the model at ``(x, t)`` and return the posterior statistics it implies.

    The model output is turned into an x_0 estimate (directly for ``"x0"``,
    via ``predict_start_from_noise`` for ``"eps"``), clamped in place to
    [-1, 1] when ``clip_denoised`` is true, and passed to ``q_posterior``.

    :return: (model_mean, posterior_variance, posterior_log_variance)
    """
    prediction = self.model(x, t)
    if self.parameterization == "eps":
        x_recon = self.predict_start_from_noise(x, t=t, noise=prediction)
    elif self.parameterization == "x0":
        x_recon = prediction
    if clip_denoised:
        x_recon.clamp_(-1., 1.)

    mean, variance, log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
    return mean, variance, log_variance
292
+
293
@torch.no_grad()
def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
    """Take one reverse step from ``x`` at timestep ``t``.

    Returns the model mean plus noise scaled by ``exp(0.5 * log_variance)``;
    the noise term is masked out for entries where ``t == 0``.
    """
    batch = x.shape[0]
    model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised)
    noise = noise_like(x.shape, x.device, repeat_noise)
    # no noise when t == 0; mask shape is (batch, 1, ..., 1) to broadcast over x
    mask_shape = (batch,) + (1,) * (len(x.shape) - 1)
    nonzero_mask = (1 - (t == 0).float()).reshape(mask_shape)
    return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
301
+
302
@torch.no_grad()
def p_sample_loop(self, shape, return_intermediates=False):
    """Generate samples by iterating ``p_sample`` from the last timestep down to 0.

    Starts from ``torch.randn(shape)`` on the device of ``self.betas``. The
    initial noise and every state where ``step % log_every_t == 0`` (or the
    first processed step) are collected as intermediates.

    :return: the final image, or ``(image, intermediates)`` when requested.
    """
    device = self.betas.device
    batch_size = shape[0]
    img = torch.randn(shape, device=device)
    intermediates = [img]
    total_steps = self.num_timesteps
    for step in tqdm(reversed(range(0, total_steps)), desc='Sampling t', total=total_steps):
        step_t = torch.full((batch_size,), step, device=device, dtype=torch.long)
        img = self.p_sample(img, step_t, clip_denoised=self.clip_denoised)
        if step % self.log_every_t == 0 or step == total_steps - 1:
            intermediates.append(img)
    return (img, intermediates) if return_intermediates else img
316
+
317
@torch.no_grad()
def sample(self, batch_size=16, return_intermediates=False):
    """Sample ``batch_size`` square images of ``self.channels`` x ``self.image_size``.

    Delegates to ``p_sample_loop`` with shape
    ``(batch_size, channels, image_size, image_size)``.
    """
    side = self.image_size
    shape = (batch_size, self.channels, side, side)
    return self.p_sample_loop(shape, return_intermediates=return_intermediates)
323
+
324
def q_sample(self, x_start, t, noise=None):
    """Diffuse ``x_start`` to timestep ``t``.

    Returns ``sqrt_alphas_cumprod[t] * x_start + sqrt_one_minus_alphas_cumprod[t] * noise``;
    ``noise`` defaults to a fresh ``torch.randn_like(x_start)`` draw.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    signal = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
    perturbation = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
    return signal + perturbation
328
+
329
def get_loss(self, pred, target, mean=True):
    """Compute the configured loss between ``pred`` and ``target``.

    :param pred: model prediction.
    :param target: regression target.
    :param mean: reduce to a scalar mean when true, otherwise return the
        element-wise loss.
    :return: absolute error for ``loss_type == 'l1'``, squared error for ``'l2'``.
    :raises NotImplementedError: for any other ``self.loss_type``; the message
        now names the offending value (it used to print a literal placeholder).
    """
    if self.loss_type == 'l1':
        loss = (target - pred).abs()
        if mean:
            loss = loss.mean()
    elif self.loss_type == 'l2':
        reduction = 'mean' if mean else 'none'
        loss = torch.nn.functional.mse_loss(target, pred, reduction=reduction)
    else:
        raise NotImplementedError(f"unknown loss type '{self.loss_type}'")

    return loss
343
+
344
def p_losses(self, x_start, t, noise=None):
    """Training loss for a batch of clean inputs at timesteps ``t``.

    ``x_start`` is noised with ``q_sample``, the model is evaluated on the
    result, and the per-sample loss against the target (the noise for ``"eps"``,
    ``x_start`` for ``"x0"``) is averaged over dims 1-3.

    :return: ``(loss, loss_dict)`` where ``loss`` is
        ``l_simple_weight * simple + original_elbo_weight * vlb`` and
        ``loss_dict`` holds ``loss_simple``, ``loss_vlb`` and ``loss`` under a
        ``train/`` or ``val/`` prefix depending on ``self.training``.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
    model_out = self.model(x_noisy, t)

    if self.parameterization == "eps":
        target = noise
    elif self.parameterization == "x0":
        target = x_start
    else:
        raise NotImplementedError(f"Paramterization {self.parameterization} not yet supported")

    per_sample = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3])
    log_prefix = 'train' if self.training else 'val'

    loss_dict = {f'{log_prefix}/loss_simple': per_sample.mean()}
    loss_simple = per_sample.mean() * self.l_simple_weight

    loss_vlb = (self.lvlb_weights[t] * per_sample).mean()
    loss_dict[f'{log_prefix}/loss_vlb'] = loss_vlb

    loss = loss_simple + self.original_elbo_weight * loss_vlb
    loss_dict[f'{log_prefix}/loss'] = loss

    return loss, loss_dict
372
+
373
def forward(self, x, *args, **kwargs):
    """Draw one random timestep per batch element and return ``p_losses`` for it.

    Timesteps are sampled uniformly from ``[0, num_timesteps)`` on ``self.device``;
    extra positional and keyword arguments are passed through to ``p_losses``.
    """
    batch_size = x.shape[0]
    t = torch.randint(0, self.num_timesteps, (batch_size,), device=self.device).long()
    return self.p_losses(x, t, *args, **kwargs)
378
+
379
def get_input(self, batch, k):
    """Fetch ``batch[k]`` as a contiguous float tensor in 'b c h w' layout.

    A three-dimensional entry first gets a trailing singleton axis; the result
    is then rearranged from 'b h w c' to 'b c h w'.
    """
    data = batch[k]
    if len(data.shape) == 3:
        data = data[..., None]
    data = rearrange(data, 'b h w c -> b c h w')
    return data.to(memory_format=torch.contiguous_format).float()
386
+
387
def shared_step(self, batch):
    """Extract the ``first_stage_key`` input from ``batch`` and run the module on it.

    :return: the ``(loss, loss_dict)`` pair produced by calling ``self``.
    """
    inputs = self.get_input(batch, self.first_stage_key)
    loss, loss_dict = self(inputs)
    return loss, loss_dict
391
+
392
def training_step(self, batch, batch_idx):
    """Run one optimisation step: optional conditioning dropout, loss, logging.

    For every key in ``self.ucg_training`` each element of ``batch[key]`` is
    replaced in place, with probability ``p``, by the configured ``val``
    (``""`` when ``val`` is None). The loss dict, ``global_step`` and, when a
    scheduler is used, the current learning rate are logged.

    :return: the loss from ``shared_step``.
    """
    for key in self.ucg_training:
        spec = self.ucg_training[key]
        p = spec["p"]
        replacement = spec["val"]
        if replacement is None:
            replacement = ""
        entries = batch[key]
        # One PRNG draw per element, in index order.
        for idx in range(len(entries)):
            if self.ucg_prng.choice(2, p=[1 - p, p]):
                entries[idx] = replacement

    loss, loss_dict = self.shared_step(batch)

    self.log_dict(loss_dict, prog_bar=True,
                  logger=True, on_step=True, on_epoch=True)

    self.log("global_step", self.global_step,
             prog_bar=True, logger=True, on_step=True, on_epoch=False)

    if self.use_scheduler:
        current_lr = self.optimizers().param_groups[0]['lr']
        self.log('lr_abs', current_lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)

    return loss
415
+
416
@torch.no_grad()
def validation_step(self, batch, batch_idx):
    """Evaluate ``batch`` with the current weights and again inside ``ema_scope``.

    Both loss dicts are logged per epoch; keys from the EMA pass get an
    ``_ema`` suffix.
    """
    _, plain_losses = self.shared_step(batch)
    with self.ema_scope():
        _, ema_losses = self.shared_step(batch)
        ema_losses = {f"{name}_ema": value for name, value in ema_losses.items()}
    log_options = dict(prog_bar=False, logger=True, on_step=False, on_epoch=True)
    self.log_dict(plain_losses, **log_options)
    self.log_dict(ema_losses, **log_options)
424
+
425
def on_train_batch_end(self, *args, **kwargs):
    """After each training batch, update the EMA copy from ``self.model`` if EMA is on."""
    if not self.use_ema:
        return
    self.model_ema(self.model)
428
+
429
def _get_rows_from_list(self, samples):
    """Arrange a list of image batches into one grid with ``len(samples)`` images per row.

    The 'n b c h w' collection is regrouped so that each batch element's images
    are adjacent, then flattened and handed to ``make_grid``.
    """
    images_per_row = len(samples)
    flattened = rearrange(samples, 'n b c h w -> (b n) c h w')
    return make_grid(flattened, nrow=images_per_row)
435
+
436
@torch.no_grad()
def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs):
    """Collect images for logging: inputs, a forward-diffusion row and optional samples.

    :param batch: data batch; the ``first_stage_key`` entry is used.
    :param N: maximum number of inputs / samples.
    :param n_row: maximum number of inputs shown in the diffusion row.
    :param sample: when true, also sample from the model under ``ema_scope``.
    :param return_keys: if given and it overlaps the produced keys, only those
        keys are returned.
    :return: dict with ``inputs``, ``diffusion_row`` and, when sampling,
        ``samples`` and ``denoise_row``.
    """
    log = dict()
    x = self.get_input(batch, self.first_stage_key)
    N = min(x.shape[0], N)
    n_row = min(x.shape[0], n_row)
    x = x.to(self.device)[:N]
    log["inputs"] = x

    # forward-diffusion row: noise the first n_row inputs at the logged timesteps
    x_start = x[:n_row]
    diffusion_row = list()
    for step in range(self.num_timesteps):
        if step % self.log_every_t != 0 and step != self.num_timesteps - 1:
            continue
        t = repeat(torch.tensor([step]), '1 -> b', b=n_row)
        t = t.to(self.device).long()
        noise = torch.randn_like(x_start)
        diffusion_row.append(self.q_sample(x_start=x_start, t=t, noise=noise))

    log["diffusion_row"] = self._get_rows_from_list(diffusion_row)

    if sample:
        # denoise row, produced with the EMA weights
        with self.ema_scope("Plotting"):
            samples, denoise_row = self.sample(batch_size=N, return_intermediates=True)

        log["samples"] = samples
        log["denoise_row"] = self._get_rows_from_list(denoise_row)

    if return_keys and np.intersect1d(list(log.keys()), return_keys).shape[0] != 0:
        return {key: log[key] for key in return_keys}
    return log
473
+
474
def configure_optimizers(self):
    """Return an AdamW optimiser over the wrapped model's parameters.

    ``self.logvar`` is appended to the parameter list when ``learn_logvar`` is
    set; the learning rate is read from ``self.learning_rate``.
    """
    trainable = list(self.model.parameters())
    if self.learn_logvar:
        trainable = trainable + [self.logvar]
    return torch.optim.AdamW(trainable, lr=self.learning_rate)
481
+
482
+
483
+ class LatentDiffusion(DDPM):
484
+ """main class"""
485
def __init__(self,
             first_stage_config,
             cond_stage_config,
             num_timesteps_cond=None,
             cond_stage_key="image",
             cond_stage_trainable=False,
             concat_mode=True,
             cond_stage_forward=None,
             conditioning_key=None,
             scale_factor=1.0,
             scale_by_std=False,
             unet_trainable=True,
             *args, **kwargs):
    """Set up the latent diffusion model: base DDPM, first stage and cond stage.

    :param first_stage_config: config instantiated by ``instantiate_first_stage``.
    :param cond_stage_config: config (or one of the ``__is_first_stage__`` /
        ``__is_unconditional__`` markers) for ``instantiate_cond_stage``.
    :param num_timesteps_cond: defaults to 1; must not exceed ``timesteps``.
    :param conditioning_key: when None it is derived from ``concat_mode``
        (``'concat'`` or ``'crossattn'``), or None for an unconditional model.
    :param scale_factor: latent scale; registered as a buffer when
        ``scale_by_std`` is true, stored as a plain attribute otherwise.
    :param kwargs: remaining arguments go to the parent constructor, except
        ``ckpt_path`` and ``ignore_keys`` which are handled here so that the
        checkpoint is loaded after all sub-modules exist.
    """
    # Set before super().__init__() because the parent calls register_schedule,
    # which reads num_timesteps_cond.
    self.num_timesteps_cond = default(num_timesteps_cond, 1)
    self.scale_by_std = scale_by_std
    # ``timesteps`` is optional for the parent (default 1000), so fall back to
    # that value instead of raising KeyError when it is not passed by keyword.
    assert self.num_timesteps_cond <= kwargs.get('timesteps', 1000)
    # for backwards compatibility after implementation of DiffusionWrapper
    if conditioning_key is None:
        conditioning_key = 'concat' if concat_mode else 'crossattn'
    if cond_stage_config == '__is_unconditional__':
        conditioning_key = None
    ckpt_path = kwargs.pop("ckpt_path", None)
    ignore_keys = kwargs.pop("ignore_keys", [])
    super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
    self.concat_mode = concat_mode
    self.cond_stage_trainable = cond_stage_trainable
    self.unet_trainable = unet_trainable
    self.cond_stage_key = cond_stage_key
    try:
        self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
    except Exception:
        # Config without ddconfig.ch_mult; a bare ``except`` would also have
        # swallowed KeyboardInterrupt / SystemExit.
        self.num_downs = 0
    if not scale_by_std:
        self.scale_factor = scale_factor
    else:
        self.register_buffer('scale_factor', torch.tensor(scale_factor))
    self.instantiate_first_stage(first_stage_config)
    self.instantiate_cond_stage(cond_stage_config)
    self.cond_stage_forward = cond_stage_forward

    # construct linear projection layer for concatenating image CLIP embedding and RT
    # (772 inputs -> 768 outputs; the leading 768x768 block starts as identity, bias as zero)
    self.cc_projection = nn.Linear(772, 768)
    nn.init.eye_(list(self.cc_projection.parameters())[0][:768, :768])
    nn.init.zeros_(list(self.cc_projection.parameters())[1])
    self.cc_projection.requires_grad_(True)

    self.clip_denoised = False
    self.bbox_tokenizer = None

    self.restarted_from_ckpt = False
    if ckpt_path is not None:
        self.init_from_ckpt(ckpt_path, ignore_keys)
        self.restarted_from_ckpt = True
538
+
539
def make_cond_schedule(self, ):
    """Build ``self.cond_ids``, a long tensor with one entry per timestep.

    All entries start at ``num_timesteps - 1``; the first ``num_timesteps_cond``
    are overwritten with rounded, evenly spaced ids from 0 to ``num_timesteps - 1``.
    """
    last_step = self.num_timesteps - 1
    spaced = torch.round(torch.linspace(0, last_step, self.num_timesteps_cond)).long()
    cond_ids = torch.full(size=(self.num_timesteps,), fill_value=last_step, dtype=torch.long)
    cond_ids[:self.num_timesteps_cond] = spaced
    self.cond_ids = cond_ids
543
+
544
@rank_zero_only
@torch.no_grad()
def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
    """On the very first training batch, set ``scale_factor`` to 1 / std of the latents.

    Only acts when ``scale_by_std`` is enabled, training starts from epoch 0 /
    step 0 / batch 0, and the model was not restarted from a checkpoint.
    """
    # only for very first batch
    is_first_batch = (self.scale_by_std and self.current_epoch == 0 and self.global_step == 0
                      and batch_idx == 0 and not self.restarted_from_ckpt)
    if not is_first_batch:
        return

    assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
    # set rescale weight to 1./std of encodings
    print("### USING STD-RESCALING ###")
    images = super().get_input(batch, self.first_stage_key)
    images = images.to(self.device)
    posterior = self.encode_first_stage(images)
    latents = self.get_first_stage_encoding(posterior).detach()
    # the existing attribute is removed before the buffer is registered under the same name
    del self.scale_factor
    self.register_buffer('scale_factor', 1. / latents.flatten().std())
    print(f"setting self.scale_factor to {self.scale_factor}")
    print("### USING STD-RESCALING ###")
560
+
561
def register_schedule(self,
                      given_betas=None, beta_schedule="linear", timesteps=1000,
                      linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """Register the parent schedule, then set up the conditioning schedule if needed.

    ``shorten_cond_schedule`` is true when ``num_timesteps_cond > 1``; in that
    case ``make_cond_schedule`` is called as well.
    """
    super().register_schedule(given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s)

    use_short_schedule = self.num_timesteps_cond > 1
    self.shorten_cond_schedule = use_short_schedule
    if use_short_schedule:
        self.make_cond_schedule()
569
+
570
def instantiate_first_stage(self, config):
    """Create the first-stage model from ``config`` and freeze it.

    The model is put in eval mode, its ``train`` attribute is replaced by
    ``disabled_train`` and every parameter has ``requires_grad`` switched off.
    """
    self.first_stage_model = instantiate_from_config(config).eval()
    self.first_stage_model.train = disabled_train
    for weight in self.first_stage_model.parameters():
        weight.requires_grad = False
576
+
577
def instantiate_cond_stage(self, config):
    """Create ``self.cond_stage_model`` according to ``config``.

    With a trainable cond stage the model is instantiated as-is (the two marker
    strings are rejected). Otherwise ``"__is_first_stage__"`` reuses the first
    stage model, ``"__is_unconditional__"`` sets it to None, and any other
    config yields a frozen, eval-mode model.
    """
    if self.cond_stage_trainable:
        assert config != '__is_first_stage__'
        assert config != '__is_unconditional__'
        self.cond_stage_model = instantiate_from_config(config)
        return

    if config == "__is_first_stage__":
        print("Using first stage also as cond stage.")
        self.cond_stage_model = self.first_stage_model
        return

    if config == "__is_unconditional__":
        print(f"Training {self.__class__.__name__} as an unconditional model.")
        self.cond_stage_model = None
        return

    # Frozen conditioning model: eval mode, train() disabled, no gradients.
    self.cond_stage_model = instantiate_from_config(config).eval()
    self.cond_stage_model.train = disabled_train
    for weight in self.cond_stage_model.parameters():
        weight.requires_grad = False
597
+
598
def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
    """Decode a list of latents and lay the images out as a grid.

    Each entry is moved to ``self.device`` and decoded with
    ``decode_first_stage``; the grid has one column per entry of ``samples``.
    """
    decoded = [
        self.decode_first_stage(latent.to(self.device),
                                force_not_quantize=force_no_decoder_quantization)
        for latent in tqdm(samples, desc=desc)
    ]
    images_per_row = len(decoded)
    stacked = torch.stack(decoded)  # n_log_step, n_row, C, H, W
    flattened = rearrange(stacked, 'n b c h w -> (b n) c h w')
    return make_grid(flattened, nrow=images_per_row)
609
+
610
def get_first_stage_encoding(self, encoder_posterior):
    """Turn a first-stage encoder output into a scaled latent.

    A ``DiagonalGaussianDistribution`` is sampled, a tensor is used directly;
    the result is multiplied by ``self.scale_factor``.

    :raises NotImplementedError: for any other input type.
    """
    if isinstance(encoder_posterior, DiagonalGaussianDistribution):
        latent = encoder_posterior.sample()
    elif isinstance(encoder_posterior, torch.Tensor):
        latent = encoder_posterior
    else:
        raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
    return self.scale_factor * latent
618
+
619
def get_learned_conditioning(self, c):
    """Encode raw conditioning ``c`` with the cond-stage model.

    When ``cond_stage_forward`` names a method, that method is called.
    Otherwise ``encode`` is used if the model has a callable one (taking the
    mode of a ``DiagonalGaussianDistribution`` result), else the model is
    called directly.
    """
    encoder = self.cond_stage_model
    if self.cond_stage_forward is not None:
        assert hasattr(encoder, self.cond_stage_forward)
        return getattr(encoder, self.cond_stage_forward)(c)

    if hasattr(encoder, 'encode') and callable(encoder.encode):
        c = encoder.encode(c)
        if isinstance(c, DiagonalGaussianDistribution):
            c = c.mode()
        return c

    return encoder(c)
631
+
632
def meshgrid(self, h, w):
    """Return an ``(h, w, 2)`` integer tensor whose entry ``[i, j]`` is ``(i, j)``."""
    rows = torch.arange(0, h).view(h, 1, 1).expand(h, w, 1)
    cols = torch.arange(0, w).view(1, w, 1).expand(h, w, 1)
    return torch.cat([rows, cols], dim=-1)
638
+
639
def delta_border(self, h, w):
    """
    :param h: height
    :param w: width
    :return: normalized distance to image border,
     with min distance = 0 at border and max dist = 0.5 at image center
    """
    lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
    coords = self.meshgrid(h, w) / lower_right_corner
    # Smallest of the distances to the top/left edges (coords) and to the
    # bottom/right edges (1 - coords).
    return torch.cat([coords, 1 - coords], dim=-1).min(dim=-1)[0]
652
+
653
def get_weighting(self, h, w, Ly, Lx, device):
    """Per-pixel blending weights for ``Ly * Lx`` patches of size ``h`` x ``w``.

    The border distance of each pixel is clipped to the configured
    ``clip_min_weight`` / ``clip_max_weight`` and repeated for every patch,
    giving shape ``(1, h * w, Ly * Lx)``. With ``tie_braker`` enabled the
    result is additionally multiplied by a clipped per-patch border weight.
    """
    params = self.split_input_params
    pixel_weight = torch.clip(self.delta_border(h, w),
                              params["clip_min_weight"],
                              params["clip_max_weight"], )
    weighting = pixel_weight.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device)

    if params["tie_braker"]:
        patch_weight = torch.clip(self.delta_border(Ly, Lx),
                                  params["clip_min_tie_weight"],
                                  params["clip_max_tie_weight"])
        weighting = weighting * patch_weight.view(1, 1, Ly * Lx).to(device)
    return weighting
668
+
669
def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once not every time, shorten code
    """Build the fold/unfold operators and blending weights for patch-wise processing.

    :param x: img of size (bs, c, h, w)
    :param kernel_size: (height, width) of one crop taken from ``x``
    :param stride: (vertical, horizontal) step between crops
    :param uf: upscaling factor of the folded output relative to ``x``
    :param df: downscaling factor of the folded output relative to ``x``
    :return: ``(fold, unfold, normalization, weighting)`` where ``unfold``
        cuts ``x`` into crops, ``fold`` stitches (possibly rescaled) crops
        back together, ``normalization`` is the folded weight map of shape
        (1, 1, H_out, W_out) and ``weighting`` has shape
        (1, 1, k_h, k_w, n_crops) in output resolution.
    :raises NotImplementedError: if both ``uf`` and ``df`` differ from 1.
    """
    bs, nc, h, w = x.shape

    # number of crops in image
    Ly = (h - kernel_size[0]) // stride[0] + 1
    Lx = (w - kernel_size[1]) // stride[1] + 1

    # Geometry of the folded output. The width entries use kernel_size[1];
    # the earlier code reused kernel_size[0], which broke non-square kernels.
    if uf == 1 and df == 1:
        out_kernel = kernel_size
        out_stride = stride
        out_size = (h, w)
    elif uf > 1 and df == 1:
        out_kernel = (kernel_size[0] * uf, kernel_size[1] * uf)
        out_stride = (stride[0] * uf, stride[1] * uf)
        out_size = (h * uf, w * uf)
    elif df > 1 and uf == 1:
        out_kernel = (kernel_size[0] // df, kernel_size[1] // df)
        out_stride = (stride[0] // df, stride[1] // df)
        out_size = (h // df, w // df)
    else:
        raise NotImplementedError

    unfold = torch.nn.Unfold(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
    fold = torch.nn.Fold(output_size=out_size, kernel_size=out_kernel,
                         dilation=1, padding=0, stride=out_stride)

    weighting = self.get_weighting(out_kernel[0], out_kernel[1], Ly, Lx, x.device).to(x.dtype)
    normalization = fold(weighting).view(1, 1, out_size[0], out_size[1])  # normalizes the overlap
    weighting = weighting.view((1, 1, out_kernel[0], out_kernel[1], Ly * Lx))

    return fold, unfold, normalization, weighting
720
+
721
+
722
@torch.no_grad()
def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
              cond_key=None, return_original_cond=False, bs=None, uncond=0.05):
    """Prepare the latent target and the conditioning dict for one batch.

    :param batch: mapping with the image under ``k``, the conditioning image
        under ``cond_key`` (default ``self.cond_stage_key``) and a pose tensor
        under ``'T'``.
    :param bs: optional cap on the number of batch elements used.
    :param uncond: per-element dropout width for classifier-free guidance. With
        one uniform draw ``r`` per element: ``r < 2 * uncond`` replaces the
        conditioning embedding by the null-prompt embedding, and
        ``uncond <= r < 3 * uncond`` zeroes the concatenated latent, so the
        two ranges overlap on ``[uncond, 2 * uncond)``.
    :param return_first_stage_outputs: also return ``x`` and its reconstruction.
    :param return_original_cond: also return the raw conditioning image.
    :param force_c_encode: accepted for interface compatibility; not used here.
    :return: list ``[z, cond, (x, xrec), (xc)]`` with the optional parts as requested;
        ``cond`` has keys ``c_crossattn`` and ``c_concat``.
    """
    x = super().get_input(batch, k)
    T = batch['T'].to(memory_format=torch.contiguous_format).float()

    if bs is not None:
        x = x[:bs]
        T = T[:bs]

    x = x.to(self.device)
    # Move T unconditionally (it used to be moved only when ``bs`` was given),
    # so the concatenation below never mixes devices.
    T = T.to(self.device)
    encoder_posterior = self.encode_first_stage(x)
    z = self.get_first_stage_encoding(encoder_posterior).detach()
    cond_key = cond_key or self.cond_stage_key
    xc = super().get_input(batch, cond_key).to(self.device)
    if bs is not None:
        xc = xc[:bs]
    cond = {}

    # Classifier-free guidance dropout masks (see ``uncond`` above).
    draw = torch.rand(x.size(0), device=x.device)
    prompt_mask = rearrange(draw < 2 * uncond, "n -> n 1 1")
    input_mask = 1 - rearrange((draw >= uncond).float() * (draw < 3 * uncond).float(), "n -> n 1 1 1")

    # z.shape: [8, 4, 64, 64]; c.shape: [8, 1, 768]
    # Gradients are re-enabled so that cc_projection stays trainable even
    # though this method runs under no_grad; the encoder outputs are detached.
    with torch.enable_grad():
        clip_emb = self.get_learned_conditioning(xc).detach()
        # The null prompt is encoded once; an earlier duplicate call whose
        # result was discarded has been removed.
        # NOTE(review): assumes the cond-stage encoder has no side effects — confirm.
        null_prompt = self.get_learned_conditioning([""]).detach()
        cond["c_crossattn"] = [self.cc_projection(
            torch.cat([torch.where(prompt_mask, null_prompt, clip_emb), T[:, None, :]], dim=-1))]
    cond["c_concat"] = [input_mask * self.encode_first_stage(xc).mode().detach()]
    out = [z, cond]
    if return_first_stage_outputs:
        xrec = self.decode_first_stage(z)
        out.extend([x, xrec])
    if return_original_cond:
        out.append(xc)
    return out
761
+
762
+ # @torch.no_grad()
763
def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
    """Decode latents ``z`` with the first-stage model.

    :param z: latents; divided by ``self.scale_factor`` before decoding.
    :param predict_cids: treat ``z`` as codebook logits/indices and look up the
        codebook entries first.
    :param force_not_quantize: forwarded to a ``VQModelInterface`` decoder
        (together with ``predict_cids``).

    When ``split_input_params`` exists and enables ``patch_distributed_vq``,
    the latent is cut into overlapping crops, each crop is decoded separately
    and the results are blended back together.
    """
    if predict_cids:
        if z.dim() == 4:
            z = torch.argmax(z.exp(), dim=1).long()
        z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
        z = rearrange(z, 'b h w c -> b c h w').contiguous()

    z = 1. / self.scale_factor * z

    def _decode(latent):
        # Only VQ decoders accept the force_not_quantize switch.
        if isinstance(self.first_stage_model, VQModelInterface):
            return self.first_stage_model.decode(latent,
                                                 force_not_quantize=predict_cids or force_not_quantize)
        return self.first_stage_model.decode(latent)

    patchwise = hasattr(self, "split_input_params") and self.split_input_params["patch_distributed_vq"]
    if not patchwise:
        return _decode(z)

    ks = self.split_input_params["ks"]  # eg. (128, 128)
    stride = self.split_input_params["stride"]  # eg. (64, 64)
    uf = self.split_input_params["vqf"]
    bs, nc, h, w = z.shape
    if ks[0] > h or ks[1] > w:
        ks = (min(ks[0], h), min(ks[1], w))
        print("reducing Kernel")

    if stride[0] > h or stride[1] > w:
        stride = (min(stride[0], h), min(stride[1], w))
        print("reducing stride")

    fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=uf)

    crops = unfold(z)  # (bn, nc * prod(**ks), L)
    # 1. Reshape to img shape
    crops = crops.view((crops.shape[0], -1, ks[0], ks[1], crops.shape[-1]))  # (bn, nc, ks[0], ks[1], L )

    # 2. apply model loop over last dim
    output_list = [_decode(crops[:, :, :, :, i]) for i in range(crops.shape[-1])]

    o = torch.stack(output_list, axis=-1)  # (bn, nc, ks[0], ks[1], L)
    o = o * weighting
    # Reverse 1. reshape to img shape
    o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
    # stitch crops together
    decoded = fold(o)
    return decoded / normalization  # norm is shape (1, 1, h, w)
821
+
822
@torch.no_grad()
def encode_first_stage(self, x):
    """Encode images ``x`` with the first-stage model.

    Without ``split_input_params`` (or with ``patch_distributed_vq`` off) the
    encoder is applied to ``x`` directly. Otherwise ``x`` is cut into
    overlapping crops, each crop is encoded, and the weighted results are
    folded back together; the input's spatial size is recorded under
    ``split_input_params['original_image_size']``.
    """
    patchwise = hasattr(self, "split_input_params") and self.split_input_params["patch_distributed_vq"]
    if not patchwise:
        return self.first_stage_model.encode(x)

    ks = self.split_input_params["ks"]  # eg. (128, 128)
    stride = self.split_input_params["stride"]  # eg. (64, 64)
    df = self.split_input_params["vqf"]
    self.split_input_params['original_image_size'] = x.shape[-2:]
    bs, nc, h, w = x.shape
    if ks[0] > h or ks[1] > w:
        ks = (min(ks[0], h), min(ks[1], w))
        print("reducing Kernel")

    if stride[0] > h or stride[1] > w:
        stride = (min(stride[0], h), min(stride[1], w))
        print("reducing stride")

    fold, unfold, normalization, weighting = self.get_fold_unfold(x, ks, stride, df=df)
    crops = unfold(x)  # (bn, nc * prod(**ks), L)
    # Reshape to img shape
    crops = crops.view((crops.shape[0], -1, ks[0], ks[1], crops.shape[-1]))  # (bn, nc, ks[0], ks[1], L )

    encoded = [self.first_stage_model.encode(crops[:, :, :, :, i])
               for i in range(crops.shape[-1])]

    o = torch.stack(encoded, axis=-1)
    o = o * weighting

    # Reverse reshape to img shape
    o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
    # stitch crops together
    stitched = fold(o)
    return stitched / normalization
861
+
862
def shared_step(self, batch, **kwargs):
    """Fetch ``(x, c)`` from ``batch`` via ``get_input`` and return ``self(x, c)``.

    Extra keyword arguments are accepted but not used.
    """
    latents, conditioning = self.get_input(batch, self.first_stage_key)
    return self(latents, conditioning)
866
+
867
def forward(self, x, c, *args, **kwargs):
    """Draw one random timestep per sample and return ``self.p_losses(x, c, t, ...)``.

    When the wrapped model has a conditioning key, ``c`` must not be None;
    if ``self.shorten_cond_schedule`` is additionally set, ``c`` is replaced
    by a noised version drawn with ``q_sample`` at timestep ``cond_ids[t]``.
    """
    n_samples = x.shape[0]
    t = torch.randint(0, self.num_timesteps, (n_samples,), device=self.device).long()

    conditioned = self.model.conditioning_key is not None
    if conditioned:
        assert c is not None
    if conditioned and self.shorten_cond_schedule:  # TODO: drop this option
        cond_t = self.cond_ids[t].to(self.device)
        c = self.q_sample(x_start=c, t=cond_t, noise=torch.randn_like(c.float()))

    return self.p_losses(x, c, t, *args, **kwargs)
877
+
878
+ def _rescale_annotations(self, bboxes, crop_coordinates): # TODO: move to dataset
879
+ def rescale_bbox(bbox):
880
+ x0 = clamp((bbox[0] - crop_coordinates[0]) / crop_coordinates[2])
881
+ y0 = clamp((bbox[1] - crop_coordinates[1]) / crop_coordinates[3])
882
+ w = min(bbox[2] / crop_coordinates[2], 1 - x0)
883
+ h = min(bbox[3] / crop_coordinates[3], 1 - y0)
884
+ return x0, y0, w, h
885
+
886
+ return [rescale_bbox(b) for b in bboxes]
887
+
888
def apply_model(self, x_noisy, t, cond, return_ids=False):
    """Run ``self.model`` on ``x_noisy`` at timestep ``t`` with conditioning ``cond``.

    ``cond`` may be a dict (passed on as keyword arguments), a list, or a
    single object. Non-dict input is wrapped as ``{'c_concat': [...]}`` when
    ``self.model.conditioning_key == 'concat'`` and as
    ``{'c_crossattn': [...]}`` otherwise.

    If ``self.split_input_params`` exists, the input is unfolded into crops,
    the model is applied to each crop with a per-crop conditioning, and the
    weighted outputs are folded back and normalized; ``return_ids`` must be
    False on that path.

    Returns the model output; when that output is a tuple and ``return_ids``
    is False only its first element is returned.
    """
    # Bring the conditioning into the keyword-dict form self.model expects.
    if not isinstance(cond, dict):
        cond_items = cond if isinstance(cond, list) else [cond]
        cond_key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
        cond = {cond_key: cond_items}

    if not hasattr(self, "split_input_params"):
        x_recon = self.model(x_noisy, t, **cond)
    else:
        assert len(cond) == 1  # todo can only deal with one conditioning atm
        assert not return_ids
        ks = self.split_input_params["ks"]  # eg. (128, 128)
        stride = self.split_input_params["stride"]  # eg. (64, 64)

        h, w = x_noisy.shape[-2:]

        fold, unfold, normalization, weighting = self.get_fold_unfold(x_noisy, ks, stride)

        z = unfold(x_noisy)  # (bn, nc * prod(**ks), L)
        # Reshape to img shape
        z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )
        n_crops = z.shape[-1]
        z_list = [z[:, :, :, :, idx] for idx in range(n_crops)]

        image_like_keys = ["image", "LR_image", "segmentation", 'bbox_img']  # todo check for completeness
        if self.cond_stage_key in image_like_keys and self.model.conditioning_key:
            # Spatial conditioning: crop it exactly like the input.
            c_key, c_values = next(iter(cond.items()))
            assert (len(c_values) == 1)  # todo extend to list with more than one elem
            c = unfold(c_values[0])
            c = c.view((c.shape[0], -1, ks[0], ks[1], c.shape[-1]))  # (bn, nc, ks[0], ks[1], L )

            cond_list = [{c_key: [c[:, :, :, :, idx]]} for idx in range(c.shape[-1])]

        elif self.cond_stage_key == 'coordinates_bbox':
            assert 'original_image_size' in self.split_input_params, 'BoudingBoxRescaling is missing original_image_size'

            # assuming padding of unfold is always 0 and its dilation is always 1
            # NOTE(review): ks[0]/stride[0] are combined with the width here and
            # below; this only matters for non-square kernels/strides - confirm intent.
            n_patches_per_row = int((w - ks[0]) / stride[0] + 1)
            full_img_h, full_img_w = self.split_input_params['original_image_size']
            # as we are operating on latents, we need the factor from the original image size to the
            # spatial latent size to properly rescale the crops for regenerating the bbox annotations
            num_downs = self.first_stage_model.encoder.num_resolutions - 1
            rescale_latent = 2 ** (num_downs)

            # top-left positions of the patches, rescaled to lie in (0, 1) for the bbox tokenizer
            tl_patch_coordinates = [(rescale_latent * stride[0] * (patch_nr % n_patches_per_row) / full_img_w,
                                     rescale_latent * stride[1] * (patch_nr // n_patches_per_row) / full_img_h)
                                    for patch_nr in range(z.shape[-1])]

            # per-patch tuple of top-left coordinate plus two relative extents
            patch_limits = [(x_tl, y_tl,
                             rescale_latent * ks[0] / full_img_w,
                             rescale_latent * ks[1] / full_img_h) for x_tl, y_tl in tl_patch_coordinates]

            # tokenize crop coordinates for the bounding boxes of the respective patches
            patch_limits_tknzd = [torch.LongTensor(self.bbox_tokenizer._crop_encoder(bbox))[None].to(self.device)
                                  for bbox in patch_limits]  # list of length l with tensors of shape (1, 2)
            # cut tknzd crop position from conditioning
            assert isinstance(cond, dict), 'cond must be dict to be fed into model'
            cut_cond = cond['c_crossattn'][0][..., :-2].to(self.device)

            adapted_cond = torch.stack([torch.cat([cut_cond, p], dim=1) for p in patch_limits_tknzd])
            adapted_cond = rearrange(adapted_cond, 'l b n -> (l b) n')
            adapted_cond = self.get_learned_conditioning(adapted_cond)
            adapted_cond = rearrange(adapted_cond, '(l b) n d -> l b n d', l=z.shape[-1])

            cond_list = [{'c_crossattn': [e]} for e in adapted_cond]

        else:
            # Same conditioning object for every crop.
            cond_list = [cond] * n_crops  # Todo make this more efficient

        # apply model by loop over crops
        output_list = [self.model(z_list[idx], t, **cond_list[idx]) for idx in range(n_crops)]
        # todo cant deal with multiple model outputs check this never happens
        assert not isinstance(output_list[0], tuple)

        o = torch.stack(output_list, axis=-1)
        o = o * weighting
        # Reverse reshape to img shape
        o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
        # stitch crops together
        x_recon = fold(o) / normalization

    if isinstance(x_recon, tuple) and not return_ids:
        return x_recon[0]
    return x_recon
985
+
986
def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
    """Return ``(sqrt_recip_alphas_cumprod[t] * x_t - pred_xstart) / sqrt_recipm1_alphas_cumprod[t]``.

    Both schedule buffers are looked up at ``t`` through
    ``extract_into_tensor`` with ``x_t.shape`` as the target shape.
    """
    recip = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    recipm1 = extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return (recip * x_t - pred_xstart) / recipm1
989
+
990
def _prior_bpd(self, x_start):
    """
    Prior KL term of the variational lower bound, in bits-per-dim.

    The KL is taken between q at the last timestep (``num_timesteps - 1``)
    and a Gaussian with mean 0 and log-variance 0, then divided by ln(2).
    This term can't be optimized, as it only depends on the encoder.

    :param x_start: the [N x C x ...] tensor of inputs.
    :return: a batch of [N] KL values (in bits), one per batch element.
    """
    n_samples = x_start.shape[0]
    last_t = torch.tensor([self.num_timesteps - 1] * n_samples, device=x_start.device)
    qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, last_t)
    kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
    bits = mean_flat(kl_prior) / np.log(2.0)
    return bits
1003
+
1004
def p_losses(self, x_start, cond, t, noise=None):
    """Compute the training loss for one batch at timesteps ``t``.

    ``x_start`` is noised with ``q_sample`` (using ``noise``, or fresh
    Gaussian noise when None), the model is applied, and its output is
    compared against ``x_start`` (parameterization ``"x0"``) or the noise
    (``"eps"``).

    :return: ``(loss, loss_dict)``; ``loss_dict`` maps ``train/...`` or
        ``val/...`` keys (chosen by ``self.training``) to the logged terms.
    :raises NotImplementedError: for any other parameterization.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
    model_output = self.apply_model(x_noisy, t, cond)

    loss_dict = {}
    prefix = 'train' if self.training else 'val'

    if self.parameterization == "x0":
        target = x_start
    elif self.parameterization == "eps":
        target = noise
    else:
        raise NotImplementedError()

    # Per-sample loss, averaged over all non-batch dimensions.
    per_sample = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
    loss_dict[f'{prefix}/loss_simple'] = per_sample.mean()

    # Weight by the per-timestep log-variance.
    logvar_t = self.logvar[t].to(self.device)
    weighted = per_sample / torch.exp(logvar_t) + logvar_t
    if self.learn_logvar:
        loss_dict[f'{prefix}/loss_gamma'] = weighted.mean()
        loss_dict['logvar'] = self.logvar.data.mean()

    loss = self.l_simple_weight * weighted.mean()

    loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3))
    loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
    loss_dict[f'{prefix}/loss_vlb'] = loss_vlb
    loss += (self.original_elbo_weight * loss_vlb)
    loss_dict[f'{prefix}/loss'] = loss

    return loss, loss_dict
1038
+
1039
def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
                    return_x0=False, score_corrector=None, corrector_kwargs=None):
    """Compute the posterior mean / variance / log-variance for one reverse step.

    The model is applied to ``x`` (optionally modified by
    ``score_corrector``), turned into an ``x_0`` estimate according to
    ``self.parameterization``, optionally clamped in place to [-1, 1] and
    quantized, and then passed to ``q_posterior``.

    :return: ``(mean, variance, log_variance)``, followed by the logits when
        ``return_codebook_ids`` is set, else by the ``x_0`` estimate when
        ``return_x0`` is set.
    :raises NotImplementedError: for parameterizations other than "eps"/"x0".
    """
    model_out = self.apply_model(x, t, c, return_ids=return_codebook_ids)

    if score_corrector is not None:
        assert self.parameterization == "eps"
        model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs)

    if return_codebook_ids:
        model_out, logits = model_out

    if self.parameterization == "eps":
        x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
    elif self.parameterization == "x0":
        x_recon = model_out
    else:
        raise NotImplementedError()

    if clip_denoised:
        x_recon.clamp_(-1., 1.)
    if quantize_denoised:
        x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon)

    model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
    stats = (model_mean, posterior_variance, posterior_log_variance)
    if return_codebook_ids:
        return stats + (logits,)
    if return_x0:
        return stats + (x_recon,)
    return stats
1069
+
1070
@torch.no_grad()
def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
             return_codebook_ids=False, quantize_denoised=False, return_x0=False,
             temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None):
    """Draw one reverse-diffusion sample from ``x`` at timestep ``t``.

    The posterior mean and log-variance come from ``p_mean_variance``; noise
    scaled by ``temperature`` (and optionally passed through dropout with
    probability ``noise_dropout``) is added for every batch element whose
    timestep is not 0.

    :return: the new sample, or ``(sample, x0)`` when ``return_x0`` is set.
    :raises DeprecationWarning: if ``return_codebook_ids`` is truthy; that
        mode is no longer supported.
    """
    # Fail before running the model: this mode has always raised, but used to
    # do so only after a full (wasted) p_mean_variance call.
    if return_codebook_ids:
        raise DeprecationWarning("Support dropped.")

    b, device = x.shape[0], x.device
    outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised,
                                   return_codebook_ids=return_codebook_ids,
                                   quantize_denoised=quantize_denoised,
                                   return_x0=return_x0,
                                   score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
    if return_x0:
        model_mean, _, model_log_variance, x0 = outputs
    else:
        model_mean, _, model_log_variance = outputs

    noise = noise_like(x.shape, device, repeat_noise) * temperature
    if noise_dropout > 0.:
        noise = torch.nn.functional.dropout(noise, p=noise_dropout)
    # no noise when t == 0
    nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))

    sample = model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
    if return_x0:
        return sample, x0
    return sample
1100
+
1101
@torch.no_grad()
def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False,
                          img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0.,
                          score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None,
                          log_every_t=None):
    """Run the reverse process and collect the intermediate ``x_0`` estimates.

    :param cond: conditioning (dict, list, sliceable object or None); it is
        truncated to ``batch_size`` entries.
    :param shape: sample shape. When ``batch_size`` is given it is prepended
        to ``shape``; otherwise ``shape[0]`` is used as the batch size.
    :param temperature: a single number applied at every step, or a
        sequence indexed by timestep.
    :param mask: optional mask; where it is 1 the sample is overwritten by a
        noised version of ``x0`` after every step (``x0`` is then required).
    :param start_T: optional cap on the number of timesteps.
    :return: ``(img, intermediates)`` with the final sample and the list of
        ``x_0`` predictions logged every ``log_every_t`` steps.
    """
    if not log_every_t:
        log_every_t = self.log_every_t
    timesteps = self.num_timesteps
    if batch_size is not None:
        b = batch_size
        shape = [batch_size] + list(shape)
    else:
        b = batch_size = shape[0]
    if x_T is None:
        img = torch.randn(shape, device=self.device)
    else:
        img = x_T
    intermediates = []
    if cond is not None:
        if isinstance(cond, dict):
            cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
                    list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
        else:
            cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]

    if start_T is not None:
        timesteps = min(timesteps, start_T)
    iterator = tqdm(reversed(range(0, timesteps)), desc='Progressive Generation',
                    total=timesteps) if verbose else reversed(
        range(0, timesteps))
    # Accept any plain number (the former exact ``float`` type check made an
    # int such as ``temperature=1`` crash on ``temperature[i]`` below).
    if isinstance(temperature, (int, float)):
        temperature = [temperature] * timesteps

    # Loop-invariant precondition: check once instead of on every step.
    if mask is not None:
        assert x0 is not None

    for i in iterator:
        ts = torch.full((b,), i, device=self.device, dtype=torch.long)
        if self.shorten_cond_schedule:
            assert self.model.conditioning_key != 'hybrid'
            tc = self.cond_ids[ts].to(cond.device)
            cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))

        img, x0_partial = self.p_sample(img, cond, ts,
                                        clip_denoised=self.clip_denoised,
                                        quantize_denoised=quantize_denoised, return_x0=True,
                                        temperature=temperature[i], noise_dropout=noise_dropout,
                                        score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
        if mask is not None:
            # Keep the masked region pinned to a noised copy of x0.
            img_orig = self.q_sample(x0, ts)
            img = img_orig * mask + (1. - mask) * img

        if i % log_every_t == 0 or i == timesteps - 1:
            intermediates.append(x0_partial)
        if callback:
            callback(i)
        if img_callback:
            img_callback(img, i)
    return img, intermediates
1156
+
1157
@torch.no_grad()
def p_sample_loop(self, cond, shape, return_intermediates=False,
                  x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False,
                  mask=None, x0=None, img_callback=None, start_T=None,
                  log_every_t=None):
    """Run the full reverse process starting from ``x_T`` (or Gaussian noise).

    :param cond: conditioning forwarded to ``p_sample``.
    :param shape: full sample shape; ``shape[0]`` is the batch size.
    :param timesteps: number of steps (defaults to ``self.num_timesteps``),
        optionally capped by ``start_T``.
    :param mask: optional mask; where it is 1 the sample is overwritten by a
        noised version of ``x0`` after every step. ``x0`` is then required
        and must have the same height and width as ``mask``.
    :return: the final sample, or ``(sample, intermediates)`` when
        ``return_intermediates`` is set. ``intermediates`` starts with the
        initial sample.
    """
    if not log_every_t:
        log_every_t = self.log_every_t
    device = self.betas.device
    b = shape[0]
    if x_T is None:
        img = torch.randn(shape, device=device)
    else:
        img = x_T

    intermediates = [img]
    if timesteps is None:
        timesteps = self.num_timesteps

    if start_T is not None:
        timesteps = min(timesteps, start_T)
    iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed(
        range(0, timesteps))

    if mask is not None:
        assert x0 is not None
        # spatial size has to match: compare BOTH height and width
        # (the former ``[2:3]`` slice only looked at the height).
        assert x0.shape[2:4] == mask.shape[2:4]

    for i in iterator:
        ts = torch.full((b,), i, device=device, dtype=torch.long)
        if self.shorten_cond_schedule:
            assert self.model.conditioning_key != 'hybrid'
            tc = self.cond_ids[ts].to(cond.device)
            cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))

        img = self.p_sample(img, cond, ts,
                            clip_denoised=self.clip_denoised,
                            quantize_denoised=quantize_denoised)
        if mask is not None:
            # Keep the masked region pinned to a noised copy of x0.
            img_orig = self.q_sample(x0, ts)
            img = img_orig * mask + (1. - mask) * img

        if i % log_every_t == 0 or i == timesteps - 1:
            intermediates.append(img)
        if callback:
            callback(i)
        if img_callback:
            img_callback(img, i)

    if return_intermediates:
        return img, intermediates
    return img
1207
+
1208
+ @torch.no_grad()
1209
+ def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
1210
+ verbose=True, timesteps=None, quantize_denoised=False,
1211
+ mask=None, x0=None, shape=None,**kwargs):
1212
+ if shape is None:
1213
+ shape = (batch_size, self.channels, self.image_size, self.image_size)
1214
+ if cond is not None:
1215
+ if isinstance(cond, dict):
1216
+ cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
1217
+ list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
1218
+ else:
1219
+ cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
1220
+ return self.p_sample_loop(cond,
1221
+ shape,
1222
+ return_intermediates=return_intermediates, x_T=x_T,
1223
+ verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised,
1224
+ mask=mask, x0=x0)
1225
+
1226
@torch.no_grad()
def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
    """Sample with a ``DDIMSampler`` when ``ddim`` is truthy, else with ``self.sample``.

    Extra keyword arguments are forwarded to whichever sampler is used.
    Returns ``(samples, intermediates)``.
    """
    if not ddim:
        samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
                                             return_intermediates=True, **kwargs)
        return samples, intermediates

    sampler = DDIMSampler(self)
    latent_shape = (self.channels, self.image_size, self.image_size)
    samples, intermediates = sampler.sample(ddim_steps, batch_size,
                                            latent_shape, cond, verbose=False, **kwargs)
    return samples, intermediates
1239
+
1240
@torch.no_grad()
def get_unconditional_conditioning(self, batch_size, null_label=None, image_size=512):
    """Build the conditioning dict used for unconditional guidance.

    ``null_label`` is encoded with ``get_learned_conditioning`` (a
    ``ListConfig`` is converted to a list first; objects with a ``.to``
    method that are neither dict nor list are moved to ``self.device``).
    The result is repeated ``batch_size`` times along the first axis.

    :return: ``{"c_crossattn": [c], "c_concat": [zeros]}`` where ``zeros``
        has shape ``[batch_size, 4, image_size // 8, image_size // 8]``.
    :raises NotImplementedError: if ``null_label`` is None.
    """
    if null_label is None:
        # todo: get null label from cond_stage_model
        raise NotImplementedError()

    xc = null_label
    if isinstance(xc, ListConfig):
        xc = list(xc)
    if not isinstance(xc, (dict, list)) and hasattr(xc, "to"):
        xc = xc.to(self.device)
    c = self.get_learned_conditioning(xc)

    c = repeat(c, '1 ... -> b ...', b=batch_size).to(self.device)
    latent_hw = image_size // 8
    concat_placeholder = torch.zeros([batch_size, 4, latent_hw, latent_hw]).to(self.device)
    return {"c_crossattn": [c], "c_concat": [concat_placeholder]}
1260
+
1261
@torch.no_grad()
def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
               quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
               plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
               use_ema_scope=True,
               **kwargs):
    """Collect a dict of tensors for visual logging.

    Depending on the flags the dict holds the inputs, their first-stage
    reconstruction, a rendering of the conditioning, a forward-diffusion
    grid, plain / quantized / classifier-free-guided samples, in- and
    outpainting results and a progressive-denoising grid.

    :param batch: data batch passed to ``get_input``.
    :param N: maximum number of examples to log.
    :param n_row: number of examples used for the diffusion grid.
    :param ddim_steps: DDIM is used for sampling unless this is None.
    :param return_keys: if given and at least one key is present in the log,
        only these keys are returned.
    :return: dict mapping log names to tensors.
    """
    # NOTE(review): the nesting of the sections below was reconstructed from a
    # flattened listing (indentation was lost) - confirm against the repository.
    ema_scope = self.ema_scope if use_ema_scope else nullcontext
    use_ddim = ddim_steps is not None

    log = dict()
    z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key,
                                       return_first_stage_outputs=True,
                                       force_c_encode=True,
                                       return_original_cond=True,
                                       bs=N)
    # The batch may contain fewer than N / n_row examples.
    N = min(x.shape[0], N)
    n_row = min(x.shape[0], n_row)
    log["inputs"] = x
    log["reconstruction"] = xrec
    if self.model.conditioning_key is not None:
        # Pick a visualisation of the conditioning depending on its kind.
        if hasattr(self.cond_stage_model, "decode"):
            xc = self.cond_stage_model.decode(c)
            log["conditioning"] = xc
        elif self.cond_stage_key in ["caption", "txt"]:
            xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2]//25)
            log["conditioning"] = xc
        elif self.cond_stage_key == 'class_label':
            xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2]//25)
            log['conditioning'] = xc
        elif isimage(xc):
            log["conditioning"] = xc
        if ismap(xc):
            log["original_conditioning"] = self.to_rgb(xc)

    if plot_diffusion_rows:
        # get diffusion row: decode z_start noised at every logged timestep
        diffusion_row = list()
        z_start = z[:n_row]
        for t in range(self.num_timesteps):
            if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
                t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
                t = t.to(self.device).long()
                noise = torch.randn_like(z_start)
                z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                diffusion_row.append(self.decode_first_stage(z_noisy))

        diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
        diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
        diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
        diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
        log["diffusion_row"] = diffusion_grid

    if sample:
        # get denoise row
        with ema_scope("Sampling"):
            samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
                                                     ddim_steps=ddim_steps,eta=ddim_eta)
            # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
        x_samples = self.decode_first_stage(samples)
        log["samples"] = x_samples
        if plot_denoise_rows:
            denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
            log["denoise_row"] = denoise_grid

        if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
                self.first_stage_model, IdentityFirstStage):
            # also display when quantizing x0 while sampling
            with ema_scope("Plotting Quantized Denoised"):
                samples, z_denoise_row = self.sample_log(cond=c,batch_size=N,ddim=use_ddim,
                                                         ddim_steps=ddim_steps,eta=ddim_eta,
                                                         quantize_denoised=True)
                # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True,
                #                                      quantize_denoised=True)
            x_samples = self.decode_first_stage(samples.to(self.device))
            log["samples_x0_quantized"] = x_samples

    if unconditional_guidance_scale > 1.0:
        # Classifier-free guidance: sample against an unconditional conditioning.
        uc = self.get_unconditional_conditioning(N, unconditional_guidance_label, image_size=x.shape[-1])
        # uc = torch.zeros_like(c)
        with ema_scope("Sampling with classifier-free guidance"):
            samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                             ddim_steps=ddim_steps, eta=ddim_eta,
                                             unconditional_guidance_scale=unconditional_guidance_scale,
                                             unconditional_conditioning=uc,
                                             )
            x_samples_cfg = self.decode_first_stage(samples_cfg)
            log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

    if inpaint:
        # make a simple center square
        b, h, w = z.shape[0], z.shape[2], z.shape[3]
        mask = torch.ones(N, h, w).to(self.device)
        # zeros will be filled in
        mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
        mask = mask[:, None, ...]
        with ema_scope("Plotting Inpaint"):

            samples, _ = self.sample_log(cond=c,batch_size=N,ddim=use_ddim, eta=ddim_eta,
                                        ddim_steps=ddim_steps, x0=z[:N], mask=mask)
        x_samples = self.decode_first_stage(samples.to(self.device))
        log["samples_inpainting"] = x_samples
        log["mask"] = mask

        # outpaint: invert the mask so the border is generated instead
        mask = 1. - mask
        with ema_scope("Plotting Outpaint"):
            samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,eta=ddim_eta,
                                        ddim_steps=ddim_steps, x0=z[:N], mask=mask)
        x_samples = self.decode_first_stage(samples.to(self.device))
        log["samples_outpainting"] = x_samples

    if plot_progressive_rows:
        with ema_scope("Plotting Progressives"):
            img, progressives = self.progressive_denoising(c,
                                                           shape=(self.channels, self.image_size, self.image_size),
                                                           batch_size=N)
        prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
        log["progressive_row"] = prog_row

    if return_keys:
        # Fall back to the full log when none of the requested keys exists.
        if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
            return log
        else:
            return {key: log[key] for key in return_keys}
    return log
1386
+
1387
def configure_optimizers(self):
    """Create the AdamW optimizer (and optional LambdaLR scheduler).

    The first parameter group is selected by ``self.unet_trainable``:

    * ``"attn"``    - parameters of ``CrossAttention`` modules whose name ends in ``attn2``
    * ``"conv_in"`` - parameters of ``diffusion_model.input_blocks[0][0]``
    * ``True`` / ``"all"`` - all parameters of ``self.model``

    It is extended by the conditioner parameters when
    ``self.cond_stage_trainable`` is set and by ``self.logvar`` when
    ``self.learn_logvar`` is set, and trained with ``self.learning_rate``.
    If ``self.cc_projection`` is not None its parameters form a second
    group trained with ten times that learning rate.

    :return: the optimizer, or ``([optimizer], [scheduler_dict])`` when
        ``self.use_scheduler`` is set.
    :raises ValueError: for an unrecognised ``unet_trainable`` setting.
    """
    lr = self.learning_rate

    # One exclusive chain: the "attn" case used to be a separate ``if`` and
    # therefore always fell through to the ValueError below.
    if self.unet_trainable == "attn":
        print("Training only unet attention layers")
        params = []
        for n, m in self.model.named_modules():
            if isinstance(m, CrossAttention) and n.endswith('attn2'):
                params.extend(m.parameters())
    elif self.unet_trainable == "conv_in":
        print("Training only unet input conv layers")
        params = list(self.model.diffusion_model.input_blocks[0][0].parameters())
    elif self.unet_trainable is True or self.unet_trainable == "all":
        print("Training the full unet")
        params = list(self.model.parameters())
    else:
        raise ValueError(f"Unrecognised setting for unet_trainable: {self.unet_trainable}")

    if self.cond_stage_trainable:
        print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
        params = params + list(self.cond_stage_model.parameters())
    if self.learn_logvar:
        print('Diffusion model optimizing logvar')
        params.append(self.logvar)

    # Pass the parameters selected above to the optimizer; previously they
    # were collected but ignored in favour of all of self.model.parameters().
    param_groups = [{"params": params, "lr": lr}]
    if self.cc_projection is not None:
        print('========== optimizing for cc projection weight ==========')
        # The projection keeps its own group with a 10x learning rate.
        param_groups.append({"params": list(self.cc_projection.parameters()), "lr": 10. * lr})

    opt = torch.optim.AdamW(param_groups, lr=lr)
    if self.use_scheduler:
        assert 'target' in self.scheduler_config
        scheduler = instantiate_from_config(self.scheduler_config)

        print("Setting up LambdaLR scheduler...")
        scheduler = [
            {
                'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule),
                'interval': 'step',
                'frequency': 1
            }]
        return [opt], scheduler
    return opt
1430
+
1431
@torch.no_grad()
def to_rgb(self, x):
    """Project ``x`` to 3 channels with a 1x1 convolution and rescale to [-1, 1].

    The convolution weight is drawn from a standard normal on first use and
    cached as ``self.colorize`` so later calls reuse the same projection.
    """
    x = x.float()
    if not hasattr(self, "colorize"):
        self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x)
    projected = nn.functional.conv2d(x, weight=self.colorize)
    lowest = projected.min()
    span = projected.max() - lowest
    return 2. * (projected - lowest) / span - 1.
1439
+
1440
+
1441
class DiffusionWrapper(pl.LightningModule):
    """Wraps the diffusion network and routes conditioning inputs to it.

    ``conditioning_key`` selects how the conditioning reaches the network:
    concatenated to the input along dim 1 (``'concat'``), as ``context``
    (``'crossattn'``), both (``'hybrid'``), both plus ``y`` (``'hybrid-adm'``),
    as ``y`` only (``'adm'``), or not at all (``None``).
    """

    def __init__(self, diff_model_config, conditioning_key):
        super().__init__()
        self.diffusion_model = instantiate_from_config(diff_model_config)
        self.conditioning_key = conditioning_key
        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm']

    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None):
        """Apply the diffusion model to ``x`` at timestep ``t``.

        ``c_concat`` entries are concatenated to ``x`` along dim 1;
        ``c_crossattn`` entries are concatenated along dim 1 and passed as
        ``context`` (for ``'adm'`` its first entry is passed as ``y``).

        :raises NotImplementedError: for an unknown ``conditioning_key``.
        """
        mode = self.conditioning_key

        if mode is None:
            return self.diffusion_model(x, t)

        if mode == 'concat':
            stacked = torch.cat([x] + c_concat, dim=1)
            return self.diffusion_model(stacked, t)

        if mode == 'crossattn':
            # c_crossattn dimension: torch.Size([8, 1, 768]) 1
            # cc dimension: torch.Size([8, 1, 768]
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(x, t, context=context)

        if mode == 'hybrid':
            stacked = torch.cat([x] + c_concat, dim=1)
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(stacked, t, context=context)

        if mode == 'hybrid-adm':
            assert c_adm is not None
            stacked = torch.cat([x] + c_concat, dim=1)
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(stacked, t, context=context, y=c_adm)

        if mode == 'adm':
            return self.diffusion_model(x, t, y=c_crossattn[0])

        raise NotImplementedError()
1475
+
1476
+
1477
+ class LatentUpscaleDiffusion(LatentDiffusion):
1478
def __init__(self, *args, low_scale_config, low_scale_key="LR", **kwargs):
    """Set up the base model, then the frozen low-scale model.

    ``low_scale_config`` is handed to ``instantiate_low_stage`` and
    ``low_scale_key`` is stored as the batch key of the low-resolution input.
    """
    super().__init__(*args, **kwargs)
    # assumes that neither the cond_stage nor the low_scale_model contain trainable params
    assert not self.cond_stage_trainable
    self.low_scale_key = low_scale_key
    self.instantiate_low_stage(low_scale_config)
1484
+
1485
def instantiate_low_stage(self, config):
    """Build the low-scale model from ``config`` and freeze it.

    The model is put in eval mode, its ``train`` method is replaced by
    ``disabled_train``, and ``requires_grad`` is switched off for all of its
    parameters. It is stored as ``self.low_scale_model``.
    """
    low_scale_model = instantiate_from_config(config).eval()
    self.low_scale_model = low_scale_model
    self.low_scale_model.train = disabled_train
    for weight in self.low_scale_model.parameters():
        weight.requires_grad = False
1491
+
1492
@torch.no_grad()
def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False):
    """Fetch latents plus the conditioning dict for the upscaler.

    On top of the parent's ``get_input`` result, the low-resolution image
    stored under ``self.low_scale_key`` is moved from ``b h w c`` to
    ``b c h w`` layout and passed through ``self.low_scale_model``, which
    yields ``zx`` and a noise level.

    :return: ``(z, all_conds)`` with ``all_conds`` holding ``c_concat``,
        ``c_crossattn`` and ``c_adm``. In ``log_mode`` the tuple is
        ``(z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level)``.
    """
    if log_mode:
        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                              force_c_encode=True, return_original_cond=True, bs=bs)
    else:
        z, c = super().get_input(batch, k, force_c_encode=True, bs=bs)

    x_low = rearrange(batch[self.low_scale_key][:bs], 'b h w c -> b c h w')
    x_low = x_low.to(memory_format=torch.contiguous_format).float()
    zx, noise_level = self.low_scale_model(x_low)
    all_conds = {"c_concat": [zx], "c_crossattn": [c], "c_adm": noise_level}

    if not log_mode:
        return z, all_conds

    # TODO: maybe disable if too expensive
    interpretability = False
    if interpretability:
        zx = zx[:, :, ::2, ::2]
    x_low_rec = self.low_scale_model.decode(zx)
    return z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level
1513
+
1514
+ @torch.no_grad()
1515
+ def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
1516
+ plot_denoise_rows=False, plot_progressive_rows=True, plot_diffusion_rows=True,
1517
+ unconditional_guidance_scale=1., unconditional_guidance_label=None, use_ema_scope=True,
1518
+ **kwargs):
1519
+ ema_scope = self.ema_scope if use_ema_scope else nullcontext
1520
+ use_ddim = ddim_steps is not None
1521
+
1522
+ log = dict()
1523
+ z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input(batch, self.first_stage_key, bs=N,
1524
+ log_mode=True)
1525
+ N = min(x.shape[0], N)
1526
+ n_row = min(x.shape[0], n_row)
1527
+ log["inputs"] = x
1528
+ log["reconstruction"] = xrec
1529
+ log["x_lr"] = x_low
1530
+ log[f"x_lr_rec_@noise_levels{'-'.join(map(lambda x: str(x), list(noise_level.cpu().numpy())))}"] = x_low_rec
1531
+ if self.model.conditioning_key is not None:
1532
+ if hasattr(self.cond_stage_model, "decode"):
1533
+ xc = self.cond_stage_model.decode(c)
1534
+ log["conditioning"] = xc
1535
+ elif self.cond_stage_key in ["caption", "txt"]:
1536
+ xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2]//25)
1537
+ log["conditioning"] = xc
1538
+ elif self.cond_stage_key == 'class_label':
1539
+ xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2]//25)
1540
+ log['conditioning'] = xc
1541
+ elif isimage(xc):
1542
+ log["conditioning"] = xc
1543
+ if ismap(xc):
1544
+ log["original_conditioning"] = self.to_rgb(xc)
1545
+
1546
+ if plot_diffusion_rows:
1547
+ # get diffusion row
1548
+ diffusion_row = list()
1549
+ z_start = z[:n_row]
1550
+ for t in range(self.num_timesteps):
1551
+ if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
1552
+ t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
1553
+ t = t.to(self.device).long()
1554
+ noise = torch.randn_like(z_start)
1555
+ z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
1556
+ diffusion_row.append(self.decode_first_stage(z_noisy))
1557
+
1558
+ diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
1559
+ diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
1560
+ diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
1561
+ diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
1562
+ log["diffusion_row"] = diffusion_grid
1563
+
1564
+ if sample:
1565
+ # get denoise row
1566
+ with ema_scope("Sampling"):
1567
+ samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
1568
+ ddim_steps=ddim_steps, eta=ddim_eta)
1569
+ # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
1570
+ x_samples = self.decode_first_stage(samples)
1571
+ log["samples"] = x_samples
1572
+ if plot_denoise_rows:
1573
+ denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
1574
+ log["denoise_row"] = denoise_grid
1575
+
1576
+ if unconditional_guidance_scale > 1.0:
1577
+ uc_tmp = self.get_unconditional_conditioning(N, unconditional_guidance_label)
1578
+ # TODO explore better "unconditional" choices for the other keys
1579
+ # maybe guide away from empty text label and highest noise level and maximally degraded zx?
1580
+ uc = dict()
1581
+ for k in c:
1582
+ if k == "c_crossattn":
1583
+ assert isinstance(c[k], list) and len(c[k]) == 1
1584
+ uc[k] = [uc_tmp]
1585
+ elif k == "c_adm": # todo: only run with text-based guidance?
1586
+ assert isinstance(c[k], torch.Tensor)
1587
+ uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level
1588
+ elif isinstance(c[k], list):
1589
+ uc[k] = [c[k][i] for i in range(len(c[k]))]
1590
+ else:
1591
+ uc[k] = c[k]
1592
+
1593
+ with ema_scope("Sampling with classifier-free guidance"):
1594
+ samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
1595
+ ddim_steps=ddim_steps, eta=ddim_eta,
1596
+ unconditional_guidance_scale=unconditional_guidance_scale,
1597
+ unconditional_conditioning=uc,
1598
+ )
1599
+ x_samples_cfg = self.decode_first_stage(samples_cfg)
1600
+ log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
1601
+
1602
+ if plot_progressive_rows:
1603
+ with ema_scope("Plotting Progressives"):
1604
+ img, progressives = self.progressive_denoising(c,
1605
+ shape=(self.channels, self.image_size, self.image_size),
1606
+ batch_size=N)
1607
+ prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
1608
+ log["progressive_row"] = prog_row
1609
+
1610
+ return log
1611
+
1612
+
1613
class LatentInpaintDiffusion(LatentDiffusion):
    """
    can either run as pure inpainting model (only concat mode) or with mixed conditionings,
    e.g. mask as concat and text via cross-attn.
    To disable finetuning mode, set finetune_keys to None
    """
    def __init__(self,
                 finetune_keys=("model.diffusion_model.input_blocks.0.0.weight",
                                "model_ema.diffusion_modelinput_blocks00weight"
                                ),
                 concat_keys=("mask", "masked_image"),
                 masked_image_key="masked_image",
                 keep_finetune_dims=4,  # if model was trained without concat mode before and we would like to keep these channels
                 c_concat_log_start=None,  # to log reconstruction of c_concat codes
                 c_concat_log_end=None,
                 *args, **kwargs
                 ):
        """Build the model and optionally restore (and channel-extend) a checkpoint.

        ``ckpt_path`` and ``ignore_keys`` are popped from ``kwargs`` before the
        parent constructor runs, so the checkpoint is loaded by this class's
        own ``init_from_ckpt`` after all attributes below are set.
        """
        ckpt_path = kwargs.pop("ckpt_path", None)
        ignore_keys = kwargs.pop("ignore_keys", list())
        super().__init__(*args, **kwargs)
        self.masked_image_key = masked_image_key
        assert self.masked_image_key in concat_keys
        self.finetune_keys = finetune_keys
        self.concat_keys = concat_keys
        self.keep_dims = keep_finetune_dims
        self.c_concat_log_start = c_concat_log_start
        self.c_concat_log_end = c_concat_log_end
        if exists(self.finetune_keys): assert exists(ckpt_path), 'can only finetune from a given checkpoint'
        if exists(ckpt_path):
            self.init_from_ckpt(ckpt_path, ignore_keys)

    def init_from_ckpt(self, path, ignore_keys=(), only_model=False):
        """Load weights from ``path``, dropping ignored keys and widening finetune keys.

        Keys starting with any prefix in ``ignore_keys`` are removed from the
        state dict. Keys listed in ``self.finetune_keys`` are replaced by a
        zero tensor shaped like the matching model parameter, whose first
        ``self.keep_dims`` entries along dim 1 are filled from the checkpoint.
        The result is loaded non-strictly into ``self`` (or ``self.model`` when
        ``only_model`` is true) and missing/unexpected keys are printed.
        """
        # NOTE: torch.load unpickles arbitrary objects; only load trusted checkpoints.
        sd = torch.load(path, map_location="cpu")
        if "state_dict" in sd:
            sd = sd["state_dict"]
        for k in list(sd.keys()):
            if any(k.startswith(ik) for ik in ignore_keys):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
                # the key is gone: skip the finetune handling, which would index sd[k]
                continue

            # make it explicit, finetune by including extra input channels
            if exists(self.finetune_keys) and k in self.finetune_keys:
                new_entry = None
                for name, param in self.named_parameters():
                    if name in self.finetune_keys:
                        print(f"modifying key '{name}' and keeping its original {self.keep_dims} (channels) dimensions only")
                        new_entry = torch.zeros_like(param)  # zero init
                assert exists(new_entry), 'did not find matching parameter to modify'
                new_entry[:, :self.keep_dims, ...] = sd[k]
                sd[k] = new_entry

        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(sd, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    @torch.no_grad()
    def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
        """Return the latent and a hybrid conditioning dict for one batch.

        The parent ``get_input`` is always queried with ``self.first_stage_key``
        (``k`` and ``cond_key`` are not used here). Every entry of
        ``self.concat_keys`` is read from ``batch`` in 'b h w c' layout: the
        masked image is encoded with the first stage, every other entry is
        interpolated to the latent's spatial size. The pieces are concatenated
        along dim 1.

        Returns ``(z, all_conds)`` or, when ``return_first_stage_outputs`` is
        true, ``(z, all_conds, x, xrec, xc)``, where ``all_conds`` has the keys
        ``"c_concat"`` and ``"c_crossattn"``.
        """
        # note: restricted to non-trainable encoders currently
        assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for inpainting'
        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                              force_c_encode=True, return_original_cond=True, bs=bs)

        assert exists(self.concat_keys)
        c_cat = list()
        for ck in self.concat_keys:
            cc = rearrange(batch[ck], 'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float()
            if bs is not None:
                cc = cc[:bs]
            # a no-op when the tensor already lives on the model's device
            cc = cc.to(self.device)
            bchw = z.shape
            if ck != self.masked_image_key:
                cc = torch.nn.functional.interpolate(cc, size=bchw[-2:])
            else:
                cc = self.get_first_stage_encoding(self.encode_first_stage(cc))
            c_cat.append(cc)
        c_cat = torch.cat(c_cat, dim=1)
        all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
        if return_first_stage_outputs:
            return z, all_conds, x, xrec, xc
        return z, all_conds

    @torch.no_grad()
    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
                   plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
                   use_ema_scope=True,
                   **kwargs):
        """Collect a dict of image tensors for logging.

        Always logs ``inputs``, ``reconstruction`` and ``masked_image``;
        depending on the flags also the conditioning, a forward-diffusion grid,
        samples, a denoise grid and classifier-free-guidance samples (only when
        ``unconditional_guidance_scale > 1.0``). ``return_keys``,
        ``quantize_denoised``, ``inpaint`` and ``plot_progressive_rows`` are
        accepted but not read by this method.
        """
        ema_scope = self.ema_scope if use_ema_scope else nullcontext
        use_ddim = ddim_steps is not None

        log = dict()
        z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key, bs=N, return_first_stage_outputs=True)
        c_cat, c = c["c_concat"][0], c["c_crossattn"][0]
        N = min(x.shape[0], N)
        n_row = min(x.shape[0], n_row)
        log["inputs"] = x
        log["reconstruction"] = xrec
        if self.model.conditioning_key is not None:
            if hasattr(self.cond_stage_model, "decode"):
                xc = self.cond_stage_model.decode(c)
                log["conditioning"] = xc
            elif self.cond_stage_key in ["caption", "txt"]:
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
                log["conditioning"] = xc
            elif self.cond_stage_key == 'class_label':
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
                log['conditioning'] = xc
            elif isimage(xc):
                log["conditioning"] = xc
            if ismap(xc):
                log["original_conditioning"] = self.to_rgb(xc)

        if not (self.c_concat_log_start is None and self.c_concat_log_end is None):
            log["c_concat_decoded"] = self.decode_first_stage(c_cat[:, self.c_concat_log_start:self.c_concat_log_end])

        if plot_diffusion_rows:
            # get diffusion row
            diffusion_row = list()
            z_start = z[:n_row]
            for t in range(self.num_timesteps):
                if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
                    t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
                    t = t.to(self.device).long()
                    noise = torch.randn_like(z_start)
                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                    diffusion_row.append(self.decode_first_stage(z_noisy))

            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
            log["diffusion_row"] = diffusion_grid

        if sample:
            # get denoise row
            with ema_scope("Sampling"):
                samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                         batch_size=N, ddim=use_ddim,
                                                         ddim_steps=ddim_steps, eta=ddim_eta)
                # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
            x_samples = self.decode_first_stage(samples)
            log["samples"] = x_samples
            if plot_denoise_rows:
                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
                log["denoise_row"] = denoise_grid

            if unconditional_guidance_scale > 1.0:
                uc_cross = self.get_unconditional_conditioning(N, unconditional_guidance_label)
                # the concat conditioning is shared by both guidance branches
                uc_cat = c_cat
                uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
                with ema_scope("Sampling with classifier-free guidance"):
                    samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                     batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=uc_full,
                                                     )
                    x_samples_cfg = self.decode_first_stage(samples_cfg)
                    log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        # read through the configured key so a non-default masked_image_key still works
        log["masked_image"] = rearrange(batch[self.masked_image_key],
                                        'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float()
        return log
1781
+
1782
+
1783
class Layout2ImgDiffusion(LatentDiffusion):
    """Latent diffusion variant that additionally logs rendered bounding-box layouts."""
    # TODO: move all layout-specific hacks to this class
    def __init__(self, cond_stage_key, *args, **kwargs):
        """Accept only the 'coordinates_bbox' conditioning key and defer to the parent."""
        assert cond_stage_key == 'coordinates_bbox', 'Layout2ImgDiffusion only for cond_stage_key="coordinates_bbox"'
        super().__init__(cond_stage_key=cond_stage_key, *args, **kwargs)

    def log_images(self, batch, N=8, *args, **kwargs):
        """Extend the parent's image log with a 'bbox_image' entry.

        The first ``N`` conditioning entries of ``batch`` are plotted by the
        conditional builder of the current split's dataset (train while
        ``self.training`` is set, validation otherwise) and stacked along dim 0.
        """
        logs = super().log_images(batch=batch, N=N, *args, **kwargs)

        split = 'validation'
        if self.training:
            split = 'train'
        dataset = self.trainer.datamodule.datasets[split]
        builder = dataset.conditional_builders[self.cond_stage_key]

        def category_to_label(catno):
            # category number -> category id -> human-readable label
            return dataset.get_textual_label(dataset.get_category_id(catno))

        plots = [
            builder.plot(tokens.detach().cpu(), category_to_label, (256, 256))
            for tokens in batch[self.cond_stage_key][:N]
        ]

        logs['bbox_image'] = torch.stack(plots, dim=0)
        return logs
1805
+
1806
+
1807
class SimpleUpscaleDiffusion(LatentDiffusion):
    """Latent diffusion conditioned on the first-stage encoding of a low-resolution image."""

    def __init__(self, *args, low_scale_key="LR", **kwargs):
        """Store the batch key of the low-resolution image; the cond stage must be frozen."""
        super().__init__(*args, **kwargs)
        # assumes that neither the cond_stage nor the low_scale_model contain trainable params
        assert not self.cond_stage_trainable
        self.low_scale_key = low_scale_key

    @torch.no_grad()
    def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False):
        """Return the latent plus a concat/cross-attention conditioning dict.

        In log mode the parent is queried with ``self.first_stage_key`` and the
        tuple ``(z, all_conds, x, xrec, xc, x_low)`` is returned; otherwise the
        parent is queried with ``k`` and only ``(z, all_conds)`` is returned.
        """
        if log_mode:
            z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                                  force_c_encode=True, return_original_cond=True, bs=bs)
        else:
            z, c = super().get_input(batch, k, force_c_encode=True, bs=bs)

        # low-res image: 'b h w c' in the batch, channels-first for the encoder
        low_res = rearrange(batch[self.low_scale_key][:bs], 'b h w c -> b c h w')
        low_res = low_res.to(memory_format=torch.contiguous_format).float()

        low_res_latent = self.get_first_stage_encoding(self.encode_first_stage(low_res)).detach()
        conditioning = {"c_concat": [low_res_latent], "c_crossattn": [c]}

        if not log_mode:
            return z, conditioning
        return z, conditioning, x, xrec, xc, low_res

    @torch.no_grad()
    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
                   plot_denoise_rows=False, plot_progressive_rows=True, plot_diffusion_rows=True,
                   unconditional_guidance_scale=1., unconditional_guidance_label=None, use_ema_scope=True,
                   **kwargs):
        """Collect inputs, reconstruction, low-res image, conditioning and samples for logging.

        Classifier-free-guidance samples are added only when sampling is on and
        ``unconditional_guidance_scale > 1.0``. The ``plot_*`` flags and
        ``return_keys`` are accepted but not read by this method.
        """
        if use_ema_scope:
            ema_scope = self.ema_scope
        else:
            ema_scope = nullcontext
        use_ddim = ddim_steps is not None

        z, c, x, xrec, xc, x_low = self.get_input(batch, self.first_stage_key, bs=N, log_mode=True)
        N = min(x.shape[0], N)
        n_row = min(x.shape[0], n_row)
        log = {"inputs": x, "reconstruction": xrec, "x_lr": x_low}

        if self.model.conditioning_key is not None:
            if hasattr(self.cond_stage_model, "decode"):
                xc = self.cond_stage_model.decode(c)
                log["conditioning"] = xc
            elif self.cond_stage_key in ["caption", "txt"]:
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2]//25)
                log["conditioning"] = xc
            elif self.cond_stage_key == 'class_label':
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2]//25)
                log['conditioning'] = xc
            elif isimage(xc):
                log["conditioning"] = xc
            if ismap(xc):
                log["original_conditioning"] = self.to_rgb(xc)

        if not sample:
            return log

        # plain conditional samples
        with ema_scope("Sampling"):
            samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta)
        log["samples"] = self.decode_first_stage(samples)

        if unconditional_guidance_scale > 1.0:
            uc_tmp = self.get_unconditional_conditioning(N, unconditional_guidance_label)
            # only the cross-attention entry is replaced; everything else is reused
            uc = dict()
            for key, value in c.items():
                if key == "c_crossattn":
                    assert isinstance(value, list) and len(value) == 1
                    uc[key] = [uc_tmp]
                elif isinstance(value, list):
                    uc[key] = list(value)
                else:
                    uc[key] = value

            with ema_scope("Sampling with classifier-free guidance"):
                samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                 ddim_steps=ddim_steps, eta=ddim_eta,
                                                 unconditional_guidance_scale=unconditional_guidance_scale,
                                                 unconditional_conditioning=uc,
                                                 )
                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = self.decode_first_stage(samples_cfg)
        return log
1898
+
1899
class MultiCatFrameDiffusion(LatentDiffusion):
    """Latent diffusion conditioned on several frames stacked along the channel axis.

    The batch entry ``low_scale_key`` is sliced into ``num_cat_frames`` groups
    of ``frame_channels`` channels; each group is encoded with the first stage
    and the encodings are concatenated along dim 1 as the concat conditioning.
    """

    def __init__(self, *args, low_scale_key="LR", num_cat_frames=2, frame_channels=3, **kwargs):
        """Store the conditioning layout; defaults reproduce the former fixed 2 x 3 channels."""
        super().__init__(*args, **kwargs)
        # assumes that neither the cond_stage nor the low_scale_model contain trainable params
        assert not self.cond_stage_trainable
        if num_cat_frames < 1 or frame_channels < 1:
            raise ValueError("num_cat_frames and frame_channels must be positive")
        self.low_scale_key = low_scale_key
        self.num_cat_frames = num_cat_frames
        self.frame_channels = frame_channels

    @torch.no_grad()
    def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False):
        """Return the latent plus the multi-frame conditioning dict.

        Returns ``(z, all_conds)`` or, in log mode,
        ``(z, all_conds, x, xrec, xc, x_low)`` where ``x_low`` is the last of
        the sliced frames (channels-first).

        Raises:
            ValueError: if the conditioning entry has fewer than
                ``num_cat_frames * frame_channels`` channels on its last axis.
        """
        if not log_mode:
            z, c = super().get_input(batch, k, force_c_encode=True, bs=bs)
        else:
            z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                                  force_c_encode=True, return_original_cond=True, bs=bs)
        cat_conds = batch[self.low_scale_key][:bs]
        needed = self.num_cat_frames * self.frame_channels
        if cat_conds.shape[-1] < needed:
            # without this check the slice below would silently be empty or short
            raise ValueError(
                f"batch['{self.low_scale_key}'] has {cat_conds.shape[-1]} channels, expected at least {needed}")
        cats = []
        for i in range(self.num_cat_frames):
            start = self.frame_channels * i
            x_low = cat_conds[:, :, :, start:start + self.frame_channels]
            x_low = rearrange(x_low, 'b h w c -> b c h w')
            x_low = x_low.to(memory_format=torch.contiguous_format).float()
            encoder_posterior = self.encode_first_stage(x_low)
            zx = self.get_first_stage_encoding(encoder_posterior).detach()
            cats.append(zx)

        all_conds = {"c_concat": [torch.cat(cats, dim=1)], "c_crossattn": [c]}

        if log_mode:
            return z, all_conds, x, xrec, xc, x_low
        return z, all_conds

    @torch.no_grad()
    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
                   plot_denoise_rows=False, plot_progressive_rows=True, plot_diffusion_rows=True,
                   unconditional_guidance_scale=1., unconditional_guidance_label=None, use_ema_scope=True,
                   **kwargs):
        """Collect inputs, reconstruction, last conditioning frame, conditioning and samples.

        Classifier-free-guidance samples are added only when sampling is on and
        ``unconditional_guidance_scale > 1.0``. The ``plot_*`` flags and
        ``return_keys`` are accepted but not read by this method.
        """
        ema_scope = self.ema_scope if use_ema_scope else nullcontext
        use_ddim = ddim_steps is not None

        log = dict()
        z, c, x, xrec, xc, x_low = self.get_input(batch, self.first_stage_key, bs=N, log_mode=True)
        N = min(x.shape[0], N)
        n_row = min(x.shape[0], n_row)
        log["inputs"] = x
        log["reconstruction"] = xrec
        log["x_lr"] = x_low

        if self.model.conditioning_key is not None:
            if hasattr(self.cond_stage_model, "decode"):
                xc = self.cond_stage_model.decode(c)
                log["conditioning"] = xc
            elif self.cond_stage_key in ["caption", "txt"]:
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2]//25)
                log["conditioning"] = xc
            elif self.cond_stage_key == 'class_label':
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2]//25)
                log['conditioning'] = xc
            elif isimage(xc):
                log["conditioning"] = xc
            if ismap(xc):
                log["original_conditioning"] = self.to_rgb(xc)

        if sample:
            # get denoise row
            with ema_scope("Sampling"):
                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                         ddim_steps=ddim_steps, eta=ddim_eta)
                # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True)
            x_samples = self.decode_first_stage(samples)
            log["samples"] = x_samples

            if unconditional_guidance_scale > 1.0:
                uc_tmp = self.get_unconditional_conditioning(N, unconditional_guidance_label)
                # only the cross-attention entry is replaced; the concat frames are reused
                uc = dict()
                for key in c:
                    if key == "c_crossattn":
                        assert isinstance(c[key], list) and len(c[key]) == 1
                        uc[key] = [uc_tmp]
                    elif isinstance(c[key], list):
                        uc[key] = list(c[key])
                    else:
                        uc[key] = c[key]

                with ema_scope("Sampling with classifier-free guidance"):
                    samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta,
                                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                                     unconditional_conditioning=uc,
                                                     )
                    x_samples_cfg = self.decode_first_stage(samples_cfg)
                    log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
        return log
ldm/models/diffusion/plms.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from functools import partial
7
+
8
+ from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
9
+ from ldm.models.diffusion.sampling_util import norm_thresholding
10
+
11
+
12
class PLMSSampler(object):
    """Sampler using pseudo linear multistep (PLMS) updates on top of a DDIM schedule.

    The wrapped ``model`` must expose ``num_timesteps``, ``device``, ``betas``,
    ``alphas_cumprod``, ``alphas_cumprod_prev`` and ``apply_model``, as read by
    the methods below.
    """

    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def register_buffer(self, name, attr):
        """Store ``attr`` on the sampler; tensors are first moved to the model's device.

        Using the model's device instead of a fixed "cuda" keeps the sampler
        usable on CPU-only hosts and on non-default GPUs.
        """
        if isinstance(attr, torch.Tensor):
            attr = attr.to(self.model.device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        """Precompute the timestep subset and the alpha/sigma tables for sampling.

        Raises:
            ValueError: if ``ddim_eta`` is non-zero.
        """
        if ddim_eta != 0:
            raise ValueError('ddim_eta must be 0 for PLMS')
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta, verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               dynamic_threshold=None,
               **kwargs
               ):
        """Build an ``S``-step schedule and run PLMS sampling.

        ``shape`` is ``(C, H, W)``; the sampled tensor has shape
        ``(batch_size, C, H, W)``. A warning is printed when the batch size of
        ``conditioning`` differs from ``batch_size``.

        Returns:
            ``(samples, intermediates)`` as produced by ``plms_sampling``.
        """
        if conditioning is not None:
            if isinstance(conditioning, dict):
                ctmp = conditioning[list(conditioning.keys())[0]]
                while isinstance(ctmp, list): ctmp = ctmp[0]
                cbs = ctmp.shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for PLMS sampling is {size}')

        samples, intermediates = self.plms_sampling(conditioning, size,
                                                    callback=callback,
                                                    img_callback=img_callback,
                                                    quantize_denoised=quantize_x0,
                                                    mask=mask, x0=x0,
                                                    ddim_use_original_steps=False,
                                                    noise_dropout=noise_dropout,
                                                    temperature=temperature,
                                                    score_corrector=score_corrector,
                                                    corrector_kwargs=corrector_kwargs,
                                                    x_T=x_T,
                                                    log_every_t=log_every_t,
                                                    unconditional_guidance_scale=unconditional_guidance_scale,
                                                    unconditional_conditioning=unconditional_conditioning,
                                                    dynamic_threshold=dynamic_threshold,
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def plms_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
                      callback=None, timesteps=None, quantize_denoised=False,
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None,
                      dynamic_threshold=None):
        """Run the reverse loop from ``x_T`` (or fresh noise) over the schedule.

        Keeps the last three noise predictions for the multistep update. When
        ``mask`` is given, ``x0`` is required and the region where ``mask`` is 1
        is overwritten each step with a noised copy of ``x0``.

        Returns:
            ``(img, intermediates)`` where ``intermediates`` has the lists
            ``'x_inter'`` and ``'pred_x0'``.
        """
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
        time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        print(f"Running PLMS Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
        old_eps = []

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)
            # the last step has no successor, so it reuses its own timestep
            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
                                      old_eps=old_eps, t_next=ts_next,
                                      dynamic_threshold=dynamic_threshold)
            img, pred_x0, e_t = outs
            old_eps.append(e_t)
            if len(old_eps) >= 4:
                old_eps.pop(0)
            if callback: callback(i)
            if img_callback: img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)

        return img, intermediates

    @torch.no_grad()
    def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None,
                      dynamic_threshold=None):
        """Perform one PLMS update step.

        ``old_eps`` holds earlier noise predictions (oldest first); its length
        selects the order of the update. ``None`` is treated as an empty
        history, and ``corrector_kwargs=None`` as no extra keyword arguments.

        Returns:
            ``(x_prev, pred_x0, e_t)`` where ``e_t`` is the raw model output at ``t``.
        """
        b, *_, device = *x.shape, x.device
        if old_eps is None:
            old_eps = []
        if corrector_kwargs is None:
            corrector_kwargs = {}

        def get_model_output(x, t):
            if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
                e_t = self.model.apply_model(x, t, c)
            else:
                # evaluate the unconditional and conditional branch in one batched call
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t] * 2)
                if isinstance(c, dict):
                    assert isinstance(unconditional_conditioning, dict)
                    c_in = dict()
                    for k in c:
                        if isinstance(c[k], list):
                            c_in[k] = [torch.cat([
                                unconditional_conditioning[k][i],
                                c[k][i]]) for i in range(len(c[k]))]
                        else:
                            c_in[k] = torch.cat([
                                unconditional_conditioning[k],
                                c[k]])
                else:
                    c_in = torch.cat([unconditional_conditioning, c])
                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

            if score_corrector is not None:
                assert self.model.parameterization == "eps"
                e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

            return e_t

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        # this table is registered on the sampler by make_schedule, not on the model
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas

        def get_x_prev_and_pred_x0(e_t, index):
            # select parameters corresponding to the currently considered timestep
            a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
            a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
            sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)

            # current prediction for x_0
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
            if quantize_denoised:
                pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
            if dynamic_threshold is not None:
                pred_x0 = norm_thresholding(pred_x0, dynamic_threshold)
            # direction pointing to x_t
            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
            noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
            if noise_dropout > 0.:
                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
            return x_prev, pred_x0

        e_t = get_model_output(x, t)
        if len(old_eps) == 0:
            # Pseudo Improved Euler (2nd order)
            x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
            e_t_next = get_model_output(x_prev, t_next)
            e_t_prime = (e_t + e_t_next) / 2
        elif len(old_eps) == 1:
            # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (3 * e_t - old_eps[-1]) / 2
        elif len(old_eps) == 2:
            # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        elif len(old_eps) >= 3:
            # 4th order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24

        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

        return x_prev, pred_x0, e_t
ldm/models/diffusion/sampling_util.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
def append_dims(x, target_dims):
    """Pad ``x`` with trailing singleton axes until it has ``target_dims`` dims.

    From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py

    Raises:
        ValueError: if ``x`` already has more than ``target_dims`` dimensions.
    """
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
    # Ellipsis keeps every existing axis; each None appends one axis of size 1.
    index = (Ellipsis,) + (None,) * missing
    return x[index]
12
+
13
+
14
def renorm_thresholding(x0, value):
    """Quantile-threshold ``x0`` after rescaling it to the range [-1, 1].

    The tensor is mapped to [-1, 1] using its global min and max, a threshold
    ``s`` is taken per sample (first axis) as the ``value`` quantile of the
    absolute values (never below 1.0), the data is clipped to ``[-s, s]`` and
    divided by ``s``, and finally the result is mapped back to the original
    value range.

    Everything runs in torch on the device of ``x0``; no einops and no
    ``self`` reference are needed (this is a free function).

    Args:
        x0: floating point tensor whose first axis indexes samples
            (at least 2 dims, since every sample is flattened).
        value: quantile in [0, 1] passed to ``torch.quantile``.

    Returns:
        Tensor with the same shape, dtype and device as ``x0``.
    """
    pred_max = x0.max()
    pred_min = x0.min()
    span = pred_max - pred_min
    if span == 0:
        # A constant tensor cannot be rescaled (division by zero); there is
        # nothing to threshold, so hand back an unchanged copy.
        return x0.clone()

    # renorm
    pred_x0 = (x0 - pred_min) / span  # 0 ... 1
    pred_x0 = 2 * pred_x0 - 1.  # -1 ... 1

    # per-sample threshold from the flattened absolute values
    s = torch.quantile(pred_x0.flatten(1).abs(), value, dim=-1)
    s = s.clamp(min=1.0)
    s = s.view(-1, *((1,) * (pred_x0.ndim - 1)))

    # clip by threshold; minimum/maximum broadcast the per-sample bound
    pred_x0 = torch.maximum(torch.minimum(pred_x0, s), -s) / s

    # undo the renorm
    pred_x0 = (pred_x0 + 1.) / 2.  # 0 ... 1
    pred_x0 = span * pred_x0 + pred_min  # orig range
    return pred_x0
40
+
41
+
42
def norm_thresholding(x0, value):
    """Scale each sample of ``x0`` by ``value / max(rms, value)``.

    ``rms`` is the root-mean-square over all non-batch elements of a sample,
    so samples whose RMS is at most ``value`` are returned unchanged.
    """
    rms = x0.pow(2).flatten(1).mean(1).sqrt()
    floor = rms.clamp(min=value)
    # add trailing singleton axes so the per-sample factor broadcasts over x0
    s = floor[(...,) + (None,) * (x0.ndim - floor.ndim)]
    return x0 * (value / s)
45
+
46
+
47
def spatial_norm_thresholding(x0, value):
    """Scale ``x0`` by ``value / max(rms, value)`` at every location.

    ``rms`` is the root-mean-square over axis 1 (the original comment labels
    the layout as ``b c h w``), kept as a singleton axis for broadcasting.
    """
    channel_rms = torch.sqrt(torch.mean(x0.pow(2), dim=1, keepdim=True))
    s = torch.clamp(channel_rms, min=value)
    return x0 * (value / s)
ldm/modules/attention.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from inspect import isfunction
2
+ import math
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch import nn, einsum
6
+ from einops import rearrange, repeat
7
+
8
+ from ldm.modules.diffusionmodules.util import checkpoint
9
+
10
+
11
def exists(val):
    """Return True for every value except ``None`` (falsy values count as existing)."""
    return not (val is None)
13
+
14
+
15
def uniq(arr):
    """Return a dict-keys view of the distinct elements of ``arr`` in first-seen order."""
    return dict.fromkeys(arr, True).keys()
17
+
18
+
19
def default(val, d):
    """Return ``val`` unless it is ``None``; otherwise fall back to ``d``.

    When ``d`` is a plain Python function (per ``inspect.isfunction``) it is
    called with no arguments and its result is returned; any other ``d`` is
    returned as-is.
    """
    if val is not None:
        return val
    if isfunction(d):
        return d()
    return d
23
+
24
+
25
def max_neg_value(t):
    """Return the most negative finite value representable in ``t``'s dtype."""
    limits = torch.finfo(t.dtype)
    return -limits.max
27
+
28
+
29
def init_(tensor):
    """Fill ``tensor`` in place with uniform samples and return it.

    Samples are drawn from ``[-1/sqrt(n), 1/sqrt(n)]`` where ``n`` is the size
    of the last axis.
    """
    bound = 1 / math.sqrt(tensor.shape[-1])
    tensor.uniform_(-bound, bound)
    return tensor
34
+
35
+
36
+ # feedforward
37
class GEGLU(nn.Module):
    """GELU-gated linear unit: one linear layer produces a value half and a gate half."""

    def __init__(self, dim_in, dim_out):
        super().__init__()
        # a single projection yields both halves, hence twice the output width
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        """Return ``value * gelu(gate)`` where both come from splitting ``proj(x)`` on the last axis."""
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)
45
+
46
+
47
class FeedForward(nn.Module):
    """Feed-forward block: input projection, dropout, then a linear output layer.

    Args:
        dim: input feature size.
        dim_out: output feature size; falls back to ``dim`` when ``None``.
        mult: hidden width multiplier (hidden size is ``int(dim * mult)``).
        glu: use a ``GEGLU`` input projection instead of ``Linear`` + ``GELU``.
        dropout: dropout probability applied after the input projection.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)

        if glu:
            project_in = GEGLU(dim, inner_dim)
        else:
            project_in = nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())

        self.net = nn.Sequential(
            project_in,
            nn.Dropout(dropout),
            nn.Linear(inner_dim, dim_out),
        )

    def forward(self, x):
        """Apply the sequential stack to ``x``."""
        return self.net(x)
65
+
66
+
67
def zero_module(module):
    """
    Set every parameter of ``module`` to zero in place and return the same module.
    """
    for param in module.parameters():
        # detach() shares storage, so zeroing it updates the parameter without autograd tracking
        param.detach().zero_()
    return module
74
+
75
+
76
def Normalize(in_channels):
    """Build a 32-group affine ``GroupNorm`` over ``in_channels`` channels (eps 1e-6)."""
    return torch.nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
78
+
79
+
80
class LinearAttention(nn.Module):
    """Attention over image feature maps that aggregates through a key/value context.

    Keys are softmax-normalised over spatial positions and contracted with the
    values first; queries are then applied to that per-head context.

    Args:
        dim: number of input (and output) channels.
        heads: number of attention heads.
        dim_head: channels per head.
    """

    def __init__(self, dim, heads=4, dim_head=32):
        super().__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        """Map a 4-D input (unpacked as b, c, h, w) to an output of the same shape."""
        b, c, h, w = x.shape
        # channel axis is laid out as (qkv, heads, c); spatial axes are flattened
        qkv = self.to_qkv(x).reshape(b, 3, self.heads, -1, h * w)
        q, k, v = qkv.unbind(dim=1)
        k = k.softmax(dim=-1)
        context = torch.einsum('bhdn,bhen->bhde', k, v)
        attended = torch.einsum('bhde,bhdn->bhen', context, q)
        # merge heads back into the channel axis and restore the spatial grid
        attended = attended.reshape(b, -1, h, w)
        return self.to_out(attended)
97
+
98
+
99
class SpatialSelfAttention(nn.Module):
    """Single-head self-attention across the spatial positions of a feature map.

    The input is group-normalised, projected to q/k/v with 1x1 convolutions,
    attended over the flattened ``h*w`` positions and added back to the input.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        # all four projections are pointwise (1x1) convolutions
        pointwise = dict(kernel_size=1, stride=1, padding=0)
        self.q = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.k = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.v = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, **pointwise)

    def forward(self, x):
        """Return ``x`` plus the projected attention output (same shape as ``x``)."""
        normed = self.norm(x)
        q = self.q(normed)
        k = self.k(normed)
        v = self.v(normed)

        # attention weights: scaled dot product between all pairs of positions
        b, c, h, w = q.shape
        q = rearrange(q, 'b c h w -> b (h w) c')
        k = rearrange(k, 'b c h w -> b c (h w)')
        scores = torch.einsum('bij,bjk->bik', q, k)
        scores = scores * (int(c)**(-0.5))
        weights = torch.nn.functional.softmax(scores, dim=2)

        # weighted sum of the values
        v = rearrange(v, 'b c h w -> b c (h w)')
        weights = rearrange(weights, 'b i j -> b j i')
        attended = torch.einsum('bij,bjk->bik', v, weights)
        attended = rearrange(attended, 'b c (h w) -> b c h w', h=h)

        return x + self.proj_out(attended)
150
+
151
+
152
class CrossAttention(nn.Module):
    """Multi-head attention from ``x`` (queries) to ``context`` (keys and values).

    Args:
        query_dim: feature size of the query input and of the output.
        context_dim: feature size of the context; falls back to ``query_dim``.
        heads: number of attention heads.
        dim_head: feature size per head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.scale = dim_head ** -0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x, context=None, mask=None):
        """Attend from ``x`` to ``context``; with no context this is self-attention.

        ``mask``, when given, is flattened per batch element and positions
        where it is False are excluded from the softmax (it is inverted with
        ``~``, so it is presumably expected to be a boolean tensor).
        """
        heads = self.heads

        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        def split_heads(t):
            # fold the head axis into the batch axis
            return rearrange(t, 'b n (h d) -> (b h) n d', h=heads)

        q, k, v = split_heads(q), split_heads(k), split_heads(v)

        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale

        if exists(mask):
            mask = rearrange(mask, 'b ... -> b (...)')
            fill_value = -torch.finfo(sim.dtype).max
            mask = repeat(mask, 'b j -> (b h) () j', h=heads)
            sim.masked_fill_(~mask, fill_value)

        # attention weights over the context positions
        attn = sim.softmax(dim=-1)

        out = einsum('b i j, b j d -> b i d', attn, v)
        out = rearrange(out, '(b h) n d -> b n (h d)', h=heads)
        return self.to_out(out)
194
+
195
+
196
class BasicTransformerBlock(nn.Module):
    """Transformer block: attention, cross-attention and feed-forward, each with a residual.

    Args:
        dim: feature size of the token sequence.
        n_heads: number of attention heads.
        d_head: feature size per head.
        dropout: dropout probability used by the attention and feed-forward layers.
        context_dim: feature size of the conditioning context.
        gated_ff: use the GEGLU variant of the feed-forward layer.
        checkpoint: flag forwarded to the ``checkpoint`` utility in ``forward``.
        disable_self_attn: make the first attention layer attend to the context
            instead of to ``x`` itself.
    """

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
                 disable_self_attn=False):
        super().__init__()
        self.disable_self_attn = disable_self_attn
        # attn1 is a self-attention unless disable_self_attn is set
        first_context_dim = context_dim if self.disable_self_attn else None
        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                                    context_dim=first_context_dim)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        # attn2 falls back to self-attention when no context is passed
        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
                                    heads=n_heads, dim_head=d_head, dropout=dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
        """Run ``_forward`` through the ``checkpoint`` helper."""
        return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)

    def _forward(self, x, context=None):
        """Apply the three pre-normalised residual sub-layers in order."""
        first_context = context if self.disable_self_attn else None
        x = self.attn1(self.norm1(x), context=first_context) + x
        x = self.attn2(self.norm2(x), context=context) + x
        x = self.ff(self.norm3(x)) + x
        return x
219
+
220
+
221
class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data.

    The input is normalised and projected with a 1x1 convolution, flattened
    to a ``b, (h w), c`` token sequence, run through ``depth`` transformer
    blocks, reshaped back to an image and projected to ``in_channels`` by a
    zero-initialised 1x1 convolution before the residual addition.
    """

    def __init__(self, in_channels, n_heads, d_head,
                 depth=1, dropout=0., context_dim=None,
                 disable_self_attn=False):
        super().__init__()
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)

        self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        blocks = []
        for _ in range(depth):
            blocks.append(
                BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim,
                                      disable_self_attn=disable_self_attn)
            )
        self.transformer_blocks = nn.ModuleList(blocks)

        # zero-initialised output projection
        self.proj_out = zero_module(
            nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
        )

    def forward(self, x, context=None):
        """Return ``x`` plus the transformer output; expects a 4-D ``x``."""
        # note: if no context is given, cross-attention defaults to self-attention
        _, _, h, w = x.shape
        residual = x
        tokens = self.proj_in(self.norm(x))
        tokens = rearrange(tokens, 'b c h w -> b (h w) c').contiguous()
        for block in self.transformer_blocks:
            tokens = block(tokens, context=context)
        image = rearrange(tokens, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
        image = self.proj_out(image)
        return image + residual
ldm/modules/diffusionmodules/__init__.py ADDED
File without changes
ldm/modules/diffusionmodules/model.py ADDED
@@ -0,0 +1,835 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pytorch_diffusion + derived encoder decoder
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ from einops import rearrange
7
+
8
+ from ldm.util import instantiate_from_config
9
+ from ldm.modules.attention import LinearAttention
10
+
11
+
12
def get_timestep_embedding(timesteps, embedding_dim):
    """
    Build sinusoidal timestep embeddings.

    This matches the implementation in Denoising Diffusion Probabilistic Models
    (from Fairseq) and in tensor2tensor, but differs slightly from the
    description in Section 3.5 of "Attention Is All You Need".

    Args:
        timesteps: 1-D tensor of timesteps (converted to float internally).
        embedding_dim: size of the embedding produced for each timestep.

    Returns:
        Float tensor of shape ``(len(timesteps), embedding_dim)`` on the device
        of ``timesteps``: sines in the first half, cosines in the second, and
        one zero column at the end when ``embedding_dim`` is odd.
    """
    assert len(timesteps.shape) == 1

    half_dim = embedding_dim // 2
    # Frequencies are spaced geometrically over half_dim - 1 steps. With a
    # single frequency (embedding_dim of 2 or 3) that count is zero, so floor
    # the denominator at 1 instead of raising ZeroDivisionError.
    log_step = math.log(10000) / max(half_dim - 1, 1)
    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -log_step)
    freqs = freqs.to(device=timesteps.device)
    angles = timesteps.float()[:, None] * freqs[None, :]
    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
31
+
32
+
33
def nonlinearity(x):
    """Swish activation: ``x * sigmoid(x)``."""
    gate = torch.sigmoid(x)
    return x * gate
36
+
37
+
38
def Normalize(in_channels, num_groups=32):
    """Build an affine ``GroupNorm`` over ``in_channels`` channels (eps 1e-6)."""
    return torch.nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)
40
+
41
+
42
class Upsample(nn.Module):
    """Nearest-neighbour 2x upsampling, optionally followed by a 3x3 convolution.

    Args:
        in_channels: channel count of the input (and of the optional conv).
        with_conv: when truthy, a ``conv`` layer is created and applied after upsampling.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            self.conv = torch.nn.Conv2d(in_channels, in_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Double the spatial size of ``x``."""
        upsampled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if not self.with_conv:
            return upsampled
        return self.conv(upsampled)
58
+
59
+
60
class Downsample(nn.Module):
    """2x spatial downsampling, by strided 3x3 convolution or by average pooling.

    Args:
        in_channels: channel count of the input (and of the optional conv).
        with_conv: when truthy use a stride-2 convolution, otherwise 2x2 average pooling.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # no asymmetric padding in torch conv, so padding is applied manually in forward
            self.conv = torch.nn.Conv2d(in_channels, in_channels,
                                        kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        """Halve the spatial size of ``x``."""
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        # pad one zero column on the right and one zero row at the bottom
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
80
+
81
+
82
class ResnetBlock(nn.Module):
    """Residual block: two norm/swish/conv stages with an optional timestep embedding.

    Args:
        in_channels: channel count of the input.
        out_channels: channel count of the output; defaults to ``in_channels``.
        conv_shortcut: when the channel counts differ, use a 3x3 conv on the
            skip path instead of a 1x1 conv.
        dropout: dropout probability applied before the second convolution.
        temb_channels: size of the timestep embedding; with a value <= 0 no
            embedding projection is created.
    """

    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1)
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1)
        # a skip-path projection is only needed when the channel count changes
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                     kernel_size=3, stride=1, padding=1)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                    kernel_size=1, stride=1, padding=0)

    def forward(self, x, temb):
        """Return the (possibly projected) input plus the residual branch.

        ``temb`` may be ``None``; otherwise its projection is added per
        channel after the first convolution.
        """
        h = self.conv1(nonlinearity(self.norm1(x)))

        if temb is not None:
            # broadcast the projected embedding over the two spatial axes
            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]

        h = self.conv2(self.dropout(nonlinearity(self.norm2(h))))

        if self.in_channels == self.out_channels:
            return x+h
        if self.use_conv_shortcut:
            x = self.conv_shortcut(x)
        else:
            x = self.nin_shortcut(x)
        return x+h
142
+
143
+
144
class LinAttnBlock(LinearAttention):
    """Single-head ``LinearAttention`` taking only ``in_channels``, to match AttnBlock usage."""

    def __init__(self, in_channels):
        # one head as wide as the input, so the hidden width equals in_channels
        super().__init__(in_channels, heads=1, dim_head=in_channels)
148
+
149
+
150
class AttnBlock(nn.Module):
    """Single-head self-attention across the spatial positions of a feature map.

    The input is group-normalised, projected to q/k/v by 1x1 convolutions,
    attended over the flattened ``h*w`` positions and added back to the input.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        # all four projections are pointwise (1x1) convolutions
        pointwise = dict(kernel_size=1, stride=1, padding=0)
        self.q = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.k = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.v = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, **pointwise)

    def forward(self, x):
        """Return ``x`` plus the projected attention output (same shape as ``x``)."""
        normed = self.norm(x)
        q = self.q(normed)
        k = self.k(normed)
        v = self.v(normed)

        # attention weights: weights[b,i,j] = softmax_j(sum_c q[b,i,c] k[b,c,j] / sqrt(c))
        b,c,h,w = q.shape
        q = q.reshape(b,c,h*w).permute(0,2,1)   # b,hw,c
        k = k.reshape(b,c,h*w)                  # b,c,hw
        weights = torch.bmm(q,k)                # b,hw,hw
        weights = weights * (int(c)**(-0.5))
        weights = torch.nn.functional.softmax(weights, dim=2)

        # weighted sum of the values: out[b,c,j] = sum_i v[b,c,i] weights_t[b,i,j]
        v = v.reshape(b,c,h*w)
        weights_t = weights.permute(0,2,1)      # b,hw,hw (first hw of k, second of q)
        attended = torch.bmm(v,weights_t).reshape(b,c,h,w)

        return x+self.proj_out(attended)
203
+
204
+
205
def make_attn(in_channels, attn_type="vanilla"):
    """Create the attention layer selected by ``attn_type``.

    ``"vanilla"`` gives an ``AttnBlock``, ``"none"`` an ``nn.Identity`` and
    ``"linear"`` a ``LinAttnBlock``. Any other value fails the assertion.
    The choice is also printed.
    """
    assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
    print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
    if attn_type == "none":
        return nn.Identity(in_channels)
    if attn_type == "vanilla":
        return AttnBlock(in_channels)
    return LinAttnBlock(in_channels)
214
+
215
+
216
class Model(nn.Module):
    """Convolutional down/middle/up network with skip connections and optional timestep conditioning.

    Every feature map produced on the way down is kept and concatenated, in
    reverse order, to the input of the matching block on the way up.

    Args:
        ch: base channel count.
        out_ch: channel count of the output.
        ch_mult: per-level channel multipliers; its length sets the number of levels.
        num_res_blocks: residual blocks per level on the way down (one more on the way up).
        attn_resolutions: resolutions at which attention follows each residual block.
        dropout: dropout probability of the residual blocks.
        resamp_with_conv: use convolutions in the down/upsampling layers.
        in_channels: channel count expected by the first convolution.
        resolution: input resolution, used to decide where attention is placed.
        use_timestep: build and use the timestep-embedding MLP.
        use_linear_attn: force ``attn_type`` to ``"linear"``.
        attn_type: attention variant passed to ``make_attn``.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = self.ch*4
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        self.use_timestep = use_timestep
        if self.use_timestep:
            # two-layer MLP applied to the sinusoidal timestep embedding
            self.temb = nn.Module()
            self.temb.dense = nn.ModuleList([
                torch.nn.Linear(self.ch, self.temb_ch),
                torch.nn.Linear(self.temb_ch, self.temb_ch),
            ])

        # downsampling path
        self.conv_in = torch.nn.Conv2d(in_channels, self.ch,
                                       kernel_size=3, stride=1, padding=1)

        curr_res = resolution
        in_ch_mult = (1,)+tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            res_blocks = nn.ModuleList()
            attn_blocks = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                res_blocks.append(ResnetBlock(in_channels=block_in,
                                              out_channels=block_out,
                                              temb_channels=self.temb_ch,
                                              dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn_blocks.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = res_blocks
            stage.attn = attn_blocks
            if i_level != self.num_resolutions-1:
                stage.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(stage)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # upsampling path
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            res_blocks = nn.ModuleList()
            attn_blocks = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            skip_in = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks+1):
                if i_block == self.num_res_blocks:
                    # the last block of a level consumes the skip saved at that level's input
                    skip_in = ch*in_ch_mult[i_level]
                res_blocks.append(ResnetBlock(in_channels=block_in+skip_in,
                                              out_channels=block_out,
                                              temb_channels=self.temb_ch,
                                              dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn_blocks.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = res_blocks
            stage.attn = attn_blocks
            if i_level != 0:
                stage.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, stage) # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, out_ch,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x, t=None, context=None):
        """Run the network.

        Args:
            x: input feature map.
            t: timesteps; required when the model was built with ``use_timestep``.
            context: optional tensor concatenated to ``x`` along the channel axis.
        """
        #assert x.shape[2] == x.shape[3] == self.resolution
        if context is not None:
            # assume aligned context, cat along channel axis
            x = torch.cat((x, context), dim=1)

        if self.use_timestep:
            # timestep embedding
            assert t is not None
            temb = get_timestep_embedding(t, self.ch)
            temb = self.temb.dense[0](temb)
            temb = nonlinearity(temb)
            temb = self.temb.dense[1](temb)
        else:
            temb = None

        # downsampling: keep every intermediate feature map as a skip
        hs = [self.conv_in(x)]
        last_level = self.num_resolutions-1
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](hs[-1], temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
                hs.append(h)
            if i_level != last_level:
                hs.append(stage.downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling: consume the skips in reverse order
        for i_level in reversed(range(self.num_resolutions)):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks+1):
                h = stage.block[i_block](torch.cat([h, hs.pop()], dim=1), temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != 0:
                h = stage.upsample(h)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h

    def get_last_layer(self):
        """Return the weight tensor of the final convolution."""
        return self.conv_out.weight
366
+
367
+
368
class Encoder(nn.Module):
    """Convolutional encoder: residual levels with downsampling, a middle stage, and an output conv.

    No timestep embedding is used (``temb_ch`` is 0). The output has
    ``2*z_channels`` channels when ``double_z`` is set, else ``z_channels``.

    Args:
        ch: base channel count.
        out_ch: accepted but not used by this class.
        ch_mult: per-level channel multipliers; its length sets the number of levels.
        num_res_blocks: residual blocks per level.
        attn_resolutions: resolutions at which attention follows each residual block.
        dropout: dropout probability of the residual blocks.
        resamp_with_conv: use a strided convolution for downsampling.
        in_channels: channel count of the input.
        resolution: input resolution, used to decide where attention is placed.
        z_channels: channel count of the latent.
        double_z: emit twice ``z_channels`` output channels.
        use_linear_attn: force ``attn_type`` to ``"linear"``.
        attn_type: attention variant passed to ``make_attn``.
        **ignore_kwargs: extra keyword arguments are accepted and ignored.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
                 **ignore_kwargs):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # downsampling path
        self.conv_in = torch.nn.Conv2d(in_channels, self.ch,
                                       kernel_size=3, stride=1, padding=1)

        curr_res = resolution
        in_ch_mult = (1,)+tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            res_blocks = nn.ModuleList()
            attn_blocks = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                res_blocks.append(ResnetBlock(in_channels=block_in,
                                              out_channels=block_out,
                                              temb_channels=self.temb_ch,
                                              dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn_blocks.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = res_blocks
            stage.attn = attn_blocks
            if i_level != self.num_resolutions-1:
                stage.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(stage)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # end
        latent_channels = 2*z_channels if double_z else z_channels
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, latent_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Encode ``x`` through the down path, the middle stage and the output conv."""
        # no timestep embedding in the encoder
        temb = None

        # downsampling
        hs = [self.conv_in(x)]
        last_level = self.num_resolutions-1
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](hs[-1], temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
                hs.append(h)
            if i_level != last_level:
                hs.append(stage.downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
460
+
461
+
462
class Decoder(nn.Module):
    """Convolutional decoder: input conv, a middle stage, residual levels with upsampling, output conv.

    No timestep embedding is used (``temb_ch`` is 0).

    Args:
        ch: base channel count.
        out_ch: channel count of the output.
        ch_mult: per-level channel multipliers; its length sets the number of levels.
        num_res_blocks: each level holds ``num_res_blocks + 1`` residual blocks.
        attn_resolutions: resolutions at which attention follows each residual block.
        dropout: dropout probability of the residual blocks.
        resamp_with_conv: use a convolution after each upsampling.
        in_channels: stored on the instance; not otherwise used here.
        resolution: output resolution, used to derive the lowest resolution.
        z_channels: channel count of the latent input.
        give_pre_end: return the features before the final norm/activation/conv.
        tanh_out: apply ``tanh`` to the output.
        use_linear_attn: force ``attn_type`` to ``"linear"``.
        attn_type: attention variant passed to ``make_attn``.
        **ignorekwargs: extra keyword arguments are accepted and ignored.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
                 attn_type="vanilla", **ignorekwargs):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out

        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,)+tuple(ch_mult)
        block_in = ch*ch_mult[self.num_resolutions-1]
        curr_res = resolution // 2**(self.num_resolutions-1)
        self.z_shape = (1,z_channels,curr_res,curr_res)
        print("Working with z of shape {} = {} dimensions.".format(
            self.z_shape, np.prod(self.z_shape)))

        # z to block_in
        self.conv_in = torch.nn.Conv2d(z_channels, block_in,
                                       kernel_size=3, stride=1, padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # upsampling path, built from the lowest resolution upwards
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            res_blocks = nn.ModuleList()
            attn_blocks = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            for _ in range(self.num_res_blocks+1):
                res_blocks.append(ResnetBlock(in_channels=block_in,
                                              out_channels=block_out,
                                              temb_channels=self.temb_ch,
                                              dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn_blocks.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = res_blocks
            stage.attn = attn_blocks
            if i_level != 0:
                stage.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, stage) # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, out_ch,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, z):
        """Decode the latent ``z``; also records ``z.shape`` in ``self.last_z_shape``."""
        #assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # no timestep embedding in the decoder
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks+1):
                h = stage.block[i_block](h, temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != 0:
                h = stage.upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.conv_out(nonlinearity(self.norm_out(h)))
        if self.tanh_out:
            h = torch.tanh(h)
        return h
569
+
570
+
571
class SimpleDecoder(nn.Module):
    """Small decoder: 1x1 conv, three residual blocks, 1x1 conv, 2x upsampling, output conv.

    Args:
        in_channels: channel count of the input.
        out_channels: channel count of the output.
        *args, **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels, in_channels, 1),
            ResnetBlock(in_channels=in_channels,
                        out_channels=2 * in_channels,
                        temb_channels=0, dropout=0.0),
            ResnetBlock(in_channels=2 * in_channels,
                        out_channels=4 * in_channels,
                        temb_channels=0, dropout=0.0),
            ResnetBlock(in_channels=4 * in_channels,
                        out_channels=2 * in_channels,
                        temb_channels=0, dropout=0.0),
            nn.Conv2d(2*in_channels, in_channels, 1),
            Upsample(in_channels, with_conv=True),
        ]
        self.model = nn.ModuleList(layers)
        # end
        self.norm_out = Normalize(in_channels)
        self.conv_out = torch.nn.Conv2d(in_channels, out_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Apply the layer stack, then norm, swish and the output convolution."""
        resnet_slots = (1, 2, 3)  # positions of the ResnetBlocks, which take (x, temb)
        for i, layer in enumerate(self.model):
            if i in resnet_slots:
                x = layer(x, None)
            else:
                x = layer(x)

        h = nonlinearity(self.norm_out(x))
        return self.conv_out(h)
605
+
606
+
607
class UpsampleDecoder(nn.Module):
    """Stack of residual levels with 2x upsampling between levels, then an output conv.

    Args:
        in_channels: channel count of the input.
        out_channels: channel count of the output.
        ch: base channel count.
        num_res_blocks: each level holds ``num_res_blocks + 1`` residual blocks.
        resolution: target resolution (only used to compute a local value here).
        ch_mult: per-level channel multipliers; its length sets the number of levels.
        dropout: dropout probability of the residual blocks.
    """

    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
                 ch_mult=(2,2), dropout=0.0):
        super().__init__()
        # upsampling
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        block_in = in_channels
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.res_blocks = nn.ModuleList()
        self.upsample_blocks = nn.ModuleList()
        last_level = self.num_resolutions - 1
        for i_level in range(self.num_resolutions):
            level_blocks = []
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                level_blocks.append(ResnetBlock(in_channels=block_in,
                                                out_channels=block_out,
                                                temb_channels=self.temb_ch,
                                                dropout=dropout))
                block_in = block_out
            self.res_blocks.append(nn.ModuleList(level_blocks))
            if i_level != last_level:
                self.upsample_blocks.append(Upsample(block_in, True))
                curr_res = curr_res * 2

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, out_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Run every level (upsampling after all but the last) and the output head."""
        h = x
        last_level = self.num_resolutions - 1
        for i_level in range(self.num_resolutions):
            level_blocks = self.res_blocks[i_level]
            for i_block in range(self.num_res_blocks + 1):
                h = level_blocks[i_block](h, None)
            if i_level != last_level:
                h = self.upsample_blocks[i_level](h)
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
653
+
654
+
655
class LatentRescaler(nn.Module):
    """Spatially rescale a feature map by ``factor``.

    Pipeline: 3x3 conv in, ``depth`` ResnetBlocks, interpolation to the scaled
    size, an attention block, ``depth`` more ResnetBlocks, 1x1 conv out.
    """

    def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
        """
        :param factor: multiplier applied to the height and width in ``forward``.
        :param in_channels: channels of the input tensor.
        :param mid_channels: channels used by all inner blocks.
        :param out_channels: channels produced by the final 1x1 conv.
        :param depth: number of ResnetBlocks before and after the interpolation.
        """
        super().__init__()
        # residual block, interpolate, residual block
        self.factor = factor
        self.conv_in = nn.Conv2d(
            in_channels, mid_channels, kernel_size=3, stride=1, padding=1
        )

        def make_res_stack():
            # ``depth`` channel-preserving ResnetBlocks without timestep embedding
            blocks = []
            for _ in range(depth):
                blocks.append(
                    ResnetBlock(
                        in_channels=mid_channels,
                        out_channels=mid_channels,
                        temb_channels=0,
                        dropout=0.0,
                    )
                )
            return nn.ModuleList(blocks)

        self.res_block1 = make_res_stack()
        self.attn = AttnBlock(mid_channels)
        self.res_block2 = make_res_stack()

        self.conv_out = nn.Conv2d(mid_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """Return ``x`` rescaled so dims 2 and 3 are ``round(size * factor)``."""
        x = self.conv_in(x)
        for pre_block in self.res_block1:
            x = pre_block(x, None)

        target_h = int(round(x.shape[2] * self.factor))
        target_w = int(round(x.shape[3] * self.factor))
        x = torch.nn.functional.interpolate(x, size=(target_h, target_w))

        x = self.attn(x)
        for post_block in self.res_block2:
            x = post_block(x, None)
        return self.conv_out(x)
690
+
691
+
692
class MergedRescaleEncoder(nn.Module):
    """An ``Encoder`` followed by a ``LatentRescaler``.

    The encoder is configured to emit ``ch * ch_mult[-1]`` channels; the
    rescaler maps those to ``out_ch`` channels while applying ``rescale_factor``.
    """

    def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
                 ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
        """Create the encoder and the rescaler; arguments are forwarded to them."""
        super().__init__()
        # width shared by the encoder output and the rescaler's inner blocks
        bridge_channels = ch * ch_mult[-1]
        self.encoder = Encoder(
            in_channels=in_channels,
            num_res_blocks=num_res_blocks,
            ch=ch,
            ch_mult=ch_mult,
            z_channels=bridge_channels,
            double_z=False,
            resolution=resolution,
            attn_resolutions=attn_resolutions,
            dropout=dropout,
            resamp_with_conv=resamp_with_conv,
            out_ch=None,
        )
        self.rescaler = LatentRescaler(
            factor=rescale_factor,
            in_channels=bridge_channels,
            mid_channels=bridge_channels,
            out_channels=out_ch,
            depth=rescale_module_depth,
        )

    def forward(self, x):
        """Encode ``x`` and rescale the resulting features."""
        return self.rescaler(self.encoder(x))
709
+
710
+
711
class MergedRescaleDecoder(nn.Module):
    """A ``LatentRescaler`` followed by a ``Decoder``.

    The rescaler lifts ``z_channels`` to ``z_channels * ch_mult[-1]`` channels,
    which is the latent width the decoder is configured to expect.
    """

    def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
                 dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
        """Create the decoder and the rescaler; arguments are forwarded to them."""
        super().__init__()
        # width of the rescaler output / decoder input
        widened_channels = z_channels * ch_mult[-1]
        self.decoder = Decoder(
            out_ch=out_ch,
            z_channels=widened_channels,
            attn_resolutions=attn_resolutions,
            dropout=dropout,
            resamp_with_conv=resamp_with_conv,
            in_channels=None,
            num_res_blocks=num_res_blocks,
            ch_mult=ch_mult,
            resolution=resolution,
            ch=ch,
        )
        self.rescaler = LatentRescaler(
            factor=rescale_factor,
            in_channels=z_channels,
            mid_channels=widened_channels,
            out_channels=widened_channels,
            depth=rescale_module_depth,
        )

    def forward(self, x):
        """Rescale the latent ``x`` and decode it."""
        return self.decoder(self.rescaler(x))
726
+
727
+
728
class Upsampler(nn.Module):
    """A ``LatentRescaler`` followed by a ``Decoder`` sized from in/out resolution.

    The decoder gets ``int(log2(out_size // in_size)) + 1`` levels, each using
    the same channel multiplier ``ch_mult``.
    """

    def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
        """
        :param in_size: input resolution; must not exceed ``out_size``.
        :param out_size: output resolution, passed to the decoder.
        :param in_channels: channels of the input (also the decoder base width).
        :param out_channels: channels of the decoder output.
        :param ch_mult: single multiplier repeated for every decoder level.
        """
        super().__init__()
        assert out_size >= in_size
        num_blocks = int(np.log2(out_size//in_size))+1
        # Rescale factor handed to the LatentRescaler: 1 plus the remainder.
        factor_up = 1.+ (out_size % in_size)
        print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
        self.rescaler = LatentRescaler(
            factor=factor_up,
            in_channels=in_channels,
            mid_channels=2 * in_channels,
            out_channels=in_channels,
        )
        per_level_mult = [ch_mult] * num_blocks
        self.decoder = Decoder(
            out_ch=out_channels,
            resolution=out_size,
            z_channels=in_channels,
            num_res_blocks=2,
            attn_resolutions=[],
            in_channels=None,
            ch=in_channels,
            ch_mult=per_level_mult,
        )

    def forward(self, x):
        """Rescale ``x`` and decode it."""
        return self.decoder(self.rescaler(x))
745
+
746
+
747
class Resize(nn.Module):
    """Resize a tensor by interpolation with a fixed mode.

    Only the fixed (non-learned) path is implemented. Requesting
    ``learned=True`` prints a note and raises ``NotImplementedError``.
    """

    def __init__(self, in_channels=None, learned=False, mode="bilinear"):
        """
        :param in_channels: unused by the implemented (non-learned) path.
        :param learned: if True, construction fails with ``NotImplementedError``.
        :param mode: interpolation mode passed to ``interpolate`` in ``forward``.
        :raises NotImplementedError: when ``learned`` is truthy.
        """
        super().__init__()
        self.with_conv = learned
        self.mode = mode
        if self.with_conv:
            # ``__name__`` (not ``__name``): the latter is name-mangled inside a
            # class body and raised AttributeError instead of the intended error.
            print(f"Note: {self.__class__.__name__} uses learned downsampling and will ignore the fixed {mode} mode")
            raise NotImplementedError()

    def forward(self, x, scale_factor=1.0):
        """Return ``x`` rescaled by ``scale_factor``.

        :param x: input tensor.
        :param scale_factor: spatial scale; ``1.0`` returns ``x`` itself untouched.
        :return: the interpolated tensor (``align_corners=False``).
        """
        if scale_factor==1.0:
            return x
        return torch.nn.functional.interpolate(
            x, mode=self.mode, align_corners=False, scale_factor=scale_factor
        )
769
+
770
class FirstStagePostProcessor(nn.Module):
    """Encode inputs with a pretrained model, then refine and downsample them.

    The pretrained model's encoding is normalized, projected to ``n_channels``,
    and passed through one (ResnetBlock, Downsample) pair per ``ch_mult`` entry.
    """

    def __init__(self, ch_mult:list, in_channels,
                 pretrained_model:nn.Module=None,
                 reshape=False,
                 n_channels=None,
                 dropout=0.,
                 pretrained_config=None):
        """
        :param ch_mult: channel multipliers, one ResnetBlock/Downsample pair each.
        :param in_channels: channels of the pretrained model's encoding.
        :param pretrained_model: ready-made model; used when no config is given.
        :param reshape: if True, ``forward`` flattens the output to ``b (h w) c``.
        :param n_channels: base width; defaults to ``pretrained_model.encoder.ch``.
        :param dropout: dropout rate forwarded to every ResnetBlock.
        :param pretrained_config: config to instantiate the pretrained model from;
            takes precedence over ``pretrained_model``.
        """
        super().__init__()
        if pretrained_config is not None:
            self.instantiate_pretrained(pretrained_config)
        else:
            assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
            self.pretrained_model = pretrained_model

        self.do_reshape = reshape

        if n_channels is None:
            n_channels = self.pretrained_model.encoder.ch

        self.proj_norm = Normalize(in_channels, num_groups=in_channels // 2)
        self.proj = nn.Conv2d(
            in_channels, n_channels, kernel_size=3, stride=1, padding=1
        )

        res_stages = []
        pool_stages = []
        width = n_channels
        for m in ch_mult:
            res_stages.append(
                ResnetBlock(in_channels=width, out_channels=m * n_channels, dropout=dropout)
            )
            width = m * n_channels
            pool_stages.append(Downsample(width, with_conv=False))

        self.model = nn.ModuleList(res_stages)
        self.downsampler = nn.ModuleList(pool_stages)

    def instantiate_pretrained(self, config):
        """Instantiate the pretrained model from ``config``, in eval mode and frozen."""
        self.pretrained_model = instantiate_from_config(config).eval()
        # self.pretrained_model.train = False
        for frozen_param in self.pretrained_model.parameters():
            frozen_param.requires_grad = False

    @torch.no_grad()
    def encode_with_pretrained(self,x):
        """Encode ``x`` with the pretrained model (no gradients).

        If the model returns a ``DiagonalGaussianDistribution``, its mode is used.
        """
        encoded = self.pretrained_model.encode(x)
        return encoded.mode() if isinstance(encoded, DiagonalGaussianDistribution) else encoded

    def forward(self,x):
        """Return the post-processed first-stage encoding of ``x``."""
        z = self.proj(self.proj_norm(self.encode_with_pretrained(x)))
        z = nonlinearity(z)

        for res_stage, pool_stage in zip(self.model, self.downsampler):
            z = pool_stage(res_stage(z, temb=None))

        if self.do_reshape:
            z = rearrange(z,'b c h w -> b (h w) c')
        return z
835
+
ldm/modules/diffusionmodules/openaimodel.py ADDED
@@ -0,0 +1,996 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+ from functools import partial
3
+ import math
4
+ from typing import Iterable
5
+
6
+ import numpy as np
7
+ import torch as th
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from ldm.modules.diffusionmodules.util import (
12
+ checkpoint,
13
+ conv_nd,
14
+ linear,
15
+ avg_pool_nd,
16
+ zero_module,
17
+ normalization,
18
+ timestep_embedding,
19
+ )
20
+ from ldm.modules.attention import SpatialTransformer
21
+ from ldm.util import exists
22
+
23
+
24
+ # dummy replace
25
def convert_module_to_f16(x):
    """Placeholder that leaves ``x`` untouched and returns None."""
    return None
27
+
28
def convert_module_to_f32(x):
    """Placeholder that leaves ``x`` untouched and returns None."""
    return None
30
+
31
+
32
+ ## go
33
class AttentionPool2d(nn.Module):
    """
    Attention-based pooling over the flattened positions of a feature map.
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        """
        :param spacial_dim: side length; the positional table has ``spacial_dim ** 2 + 1`` slots.
        :param embed_dim: channel count of the input features.
        :param num_heads_channels: channels per head (``embed_dim // num_heads_channels`` heads).
        :param output_dim: output channels; falls back to ``embed_dim`` when falsy.
        """
        super().__init__()
        # one slot per position plus one for the prepended mean token
        token_count = spacial_dim ** 2 + 1
        self.positional_embedding = nn.Parameter(
            th.randn(embed_dim, token_count) / embed_dim ** 0.5
        )
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        """Pool ``x`` ([N, C, ...]) and return the output at the mean-token slot."""
        batch, chans, *_spatial = x.shape
        tokens = x.reshape(batch, chans, -1)  # NC(HW)
        mean_token = tokens.mean(dim=-1, keepdim=True)
        tokens = th.cat([mean_token, tokens], dim=-1)  # NC(HW+1)
        tokens = tokens + self.positional_embedding[None, :, :].to(tokens.dtype)  # NC(HW+1)
        pooled = self.c_proj(self.attention(self.qkv_proj(tokens)))
        # slot 0 corresponds to the prepended mean token
        return pooled[:, :, 0]
61
+
62
+
63
class TimestepBlock(nn.Module):
    """
    Base class for modules whose forward() accepts timestep embeddings as
    its second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x`, conditioned on the timestep embeddings `emb`.
        """
73
+
74
+
75
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    Sequential container that hands each child the extra input it supports:
    timestep embeddings for TimestepBlocks, context for SpatialTransformers,
    and nothing extra for every other layer.
    """

    def forward(self, x, emb, context=None):
        """Run the children in order, dispatching on each child's type."""
        for child in self:
            if isinstance(child, TimestepBlock):
                x = child(x, emb)
                continue
            if isinstance(child, SpatialTransformer):
                x = child(x, context)
                continue
            x = child(x)
        return x
90
+
91
+
92
class Upsample(nn.Module):
    """
    Nearest-neighbour 2x upsampling, optionally followed by a convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, only the
                 last two dimensions are upsampled.
    :param out_channels: channels produced by the convolution; defaults to
                 `channels` when falsy.
    :param padding: padding passed to the convolution.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels if out_channels else channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)

    def forward(self, x):
        """Upsample `x` ([N, C, ...]); the channel count must equal `self.channels`."""
        assert x.shape[1] == self.channels
        if self.dims == 3:
            # keep dim 2 as is, double the two innermost dims
            target_size = (x.shape[2], x.shape[3] * 2, x.shape[4] * 2)
            upsampled = F.interpolate(x, target_size, mode="nearest")
        else:
            upsampled = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(upsampled) if self.use_conv else upsampled
121
+
122
class TransposedUpsample(nn.Module):
    """Learned upsampling via a stride-2 transposed convolution, no padding.

    :param channels: channels of the input.
    :param out_channels: channels of the output; defaults to `channels` when falsy.
    :param ks: kernel size of the transposed convolution.
    """

    def __init__(self, channels, out_channels=None, ks=5):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels if out_channels else channels

        self.up = nn.ConvTranspose2d(
            self.channels, self.out_channels, kernel_size=ks, stride=2
        )

    def forward(self,x):
        """Apply the transposed convolution to `x`."""
        upsampled = self.up(x)
        return upsampled
133
+
134
+
135
class Downsample(nn.Module):
    """
    Stride-2 downsampling, done by a convolution or by average pooling.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied; otherwise
                 average pooling is used and the channel count cannot change.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, the stride
                 is (1, 2, 2), so the first of the three dims is kept.
    :param out_channels: channels produced by the convolution; defaults to
                 `channels` when falsy.
    :param padding: padding passed to the convolution.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels if out_channels else channels
        self.use_conv = use_conv
        self.dims = dims
        stride = (1, 2, 2) if dims == 3 else 2
        if not use_conv:
            # pooling cannot change the channel count
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
        else:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
            )

    def forward(self, x):
        """Downsample `x` ([N, C, ...]); the channel count must equal `self.channels`."""
        assert x.shape[1] == self.channels
        downsampled = self.op(x)
        return downsampled
162
+
163
+
164
class ResBlock(TimestepBlock):
    """
    Residual block conditioned on a timestep embedding; it may change the
    channel count and may resample its input.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, the skip connection
        uses a 3x3 convolution instead of a 1x1 convolution to change channels.
    :param use_scale_shift_norm: if True, the embedding yields a scale and a
        shift applied after the output normalization instead of being added.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    """

    def __init__(
        self,
        channels,
        emb_channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        use_checkpoint=False,
        up=False,
        down=False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        # `up` wins over `down` when both are set
        if up:
            resampler_cls = Upsample
        elif down:
            resampler_cls = Downsample
        else:
            resampler_cls = None

        if resampler_cls is None:
            # a single Identity instance is shared by both attributes
            passthrough = nn.Identity()
            self.h_upd = passthrough
            self.x_upd = passthrough
        else:
            self.h_upd = resampler_cls(channels, False, dims)
            self.x_upd = resampler_cls(channels, False, dims)

        # scale-shift conditioning needs two values per output channel
        emb_out_width = 2 * self.out_channels if use_scale_shift_norm else self.out_channels
        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            linear(emb_channels, emb_out_width),
        )
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
            ),
        )

        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        else:
            skip_kernel = 3 if use_conv else 1
            if use_conv:
                self.skip_connection = conv_nd(
                    dims, channels, self.out_channels, skip_kernel, padding=1
                )
            else:
                self.skip_connection = conv_nd(dims, channels, self.out_channels, skip_kernel)

    def forward(self, x, emb):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.
        Delegates to `_forward` through the `checkpoint` helper.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        return checkpoint(
            self._forward, (x, emb), self.parameters(), self.use_checkpoint
        )

    def _forward(self, x, emb):
        """Compute the residual branch and add it to the skip connection."""
        if not self.updown:
            h = self.in_layers(x)
        else:
            # resample between the norm/activation and the input convolution,
            # and resample the skip path the same way
            pre_conv, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = self.h_upd(pre_conv(x))
            x = self.x_upd(x)
            h = in_conv(h)

        emb_out = self.emb_layers(emb).type(h.dtype)
        # append singleton dims until emb_out has as many dims as h
        missing_dims = len(h.shape) - len(emb_out.shape)
        if missing_dims > 0:
            emb_out = emb_out[(...,) + (None,) * missing_dims]

        if self.use_scale_shift_norm:
            out_norm, after_norm = self.out_layers[0], self.out_layers[1:]
            scale, shift = th.chunk(emb_out, 2, dim=1)
            h = after_norm(out_norm(h) * (1 + scale) + shift)
        else:
            h = self.out_layers(h + emb_out)
        return self.skip_connection(x) + h
277
+
278
+
279
class AttentionBlock(nn.Module):
    """
    Self-attention over all spatial positions of an [N x C x ...] tensor.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    """

    def __init__(
        self,
        channels,
        num_heads=1,
        num_head_channels=-1,
        use_checkpoint=False,
        use_new_attention_order=False,
    ):
        """
        :param channels: channels of the input and output.
        :param num_heads: head count, used only when `num_head_channels` is -1.
        :param num_head_channels: channels per head; when not -1 it determines
            the head count and must divide `channels`.
        :param use_checkpoint: stored on the instance (see note in `forward`).
        :param use_new_attention_order: pick QKVAttention instead of
            QKVAttentionLegacy.
        """
        super().__init__()
        self.channels = channels
        if num_head_channels != -1:
            assert (
                channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels
        else:
            self.num_heads = num_heads
        self.use_checkpoint = use_checkpoint
        self.norm = normalization(channels)
        self.qkv = conv_nd(1, channels, channels * 3, 1)
        # new order: split qkv before split heads; legacy: split heads before split qkv
        attention_cls = QKVAttention if use_new_attention_order else QKVAttentionLegacy
        self.attention = attention_cls(self.num_heads)

        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

    def forward(self, x):
        """Run `_forward` through the `checkpoint` helper."""
        # NOTE: the flag is hard-coded to True here, so `self.use_checkpoint`
        # is not consulted.
        # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
        return checkpoint(self._forward, (x,), self.parameters(), True)
        #return pt_checkpoint(self._forward, x)  # pytorch

    def _forward(self, x):
        """Attend over the flattened positions and add the result to the input."""
        batch, chans, *spatial = x.shape
        flat = x.reshape(batch, chans, -1)
        attended = self.attention(self.qkv(self.norm(flat)))
        residual = self.proj_out(attended)
        return (flat + residual).reshape(batch, chans, *spatial)
326
+
327
+
328
def count_flops_attn(model, _x, y):
    """
    Op counter for attention, for use with the `thop` package. Adds the
    matmul op count derived from the shape of `y[0]` to `model.total_ops`.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    """
    batch, chans, *spatial_dims = y[0].shape
    positions = int(np.prod(spatial_dims))
    # Two matmuls of equal cost: one builds the attention weights, the other
    # combines the value vectors with them.
    ops_per_matmul = batch * (positions ** 2) * chans
    total = 2 * ops_per_matmul
    model.total_ops += th.DoubleTensor([total])
346
+
347
+
348
class QKVAttentionLegacy(nn.Module):
    """
    QKV attention that splits the channel axis into heads first and into
    Q, K, V second. Matches legacy QKVAttention + input/output heads shaping.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        heads = self.n_heads
        assert width % (3 * heads) == 0
        ch = width // (3 * heads)
        per_head = qkv.reshape(bs * heads, ch * 3, length)
        q, k, v = per_head.split(ch, dim=1)
        # Scaling q and k separately is more stable with f16 than dividing afterwards
        scale = 1 / math.sqrt(math.sqrt(ch))
        scores = th.einsum("bct,bcs->bts", q * scale, k * scale)
        # softmax is computed in float32, then cast back
        probs = th.softmax(scores.float(), dim=-1).type(scores.dtype)
        attended = th.einsum("bts,bcs->bct", probs, v)
        return attended.reshape(bs, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        """Forward to `count_flops_attn` (hook for the `thop` profiler)."""
        return count_flops_attn(model, _x, y)
378
+
379
+
380
class QKVAttention(nn.Module):
    """
    QKV attention that splits the channel axis into Q, K, V first and into
    heads second.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        heads = self.n_heads
        assert width % (3 * heads) == 0
        ch = width // (3 * heads)
        head_shape = (bs * heads, ch, length)
        q, k, v = qkv.chunk(3, dim=1)
        # Scaling q and k separately is more stable with f16 than dividing afterwards
        scale = 1 / math.sqrt(math.sqrt(ch))
        q_heads = (q * scale).view(*head_shape)
        k_heads = (k * scale).view(*head_shape)
        scores = th.einsum("bct,bcs->bts", q_heads, k_heads)
        # softmax is computed in float32, then cast back
        probs = th.softmax(scores.float(), dim=-1).type(scores.dtype)
        attended = th.einsum("bts,bcs->bct", probs, v.reshape(*head_shape))
        return attended.reshape(bs, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        """Forward to `count_flops_attn` (hook for the `thop` profiler)."""
        return count_flops_attn(model, _x, y)
412
+
413
+
414
+ class UNetModel(nn.Module):
415
+ """
416
+ The full UNet model with attention and timestep embedding.
417
+ :param in_channels: channels in the input Tensor.
418
+ :param model_channels: base channel count for the model.
419
+ :param out_channels: channels in the output Tensor.
420
+ :param num_res_blocks: number of residual blocks per downsample.
421
+ :param attention_resolutions: a collection of downsample rates at which
422
+ attention will take place. May be a set, list, or tuple.
423
+ For example, if this contains 4, then at 4x downsampling, attention
424
+ will be used.
425
+ :param dropout: the dropout probability.
426
+ :param channel_mult: channel multiplier for each level of the UNet.
427
+ :param conv_resample: if True, use learned convolutions for upsampling and
428
+ downsampling.
429
+ :param dims: determines if the signal is 1D, 2D, or 3D.
430
+ :param num_classes: if specified (as an int), then this model will be
431
+ class-conditional with `num_classes` classes.
432
+ :param use_checkpoint: use gradient checkpointing to reduce memory usage.
433
+ :param num_heads: the number of attention heads in each attention layer.
434
+ :param num_heads_channels: if specified, ignore num_heads and instead use
435
+ a fixed channel width per attention head.
436
+ :param num_heads_upsample: works with num_heads to set a different number
437
+ of heads for upsampling. Deprecated.
438
+ :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
439
+ :param resblock_updown: use residual blocks for up/downsampling.
440
+ :param use_new_attention_order: use a different attention pattern for potentially
441
+ increased efficiency.
442
+ """
443
+
444
+ def __init__(
445
+ self,
446
+ image_size,
447
+ in_channels,
448
+ model_channels,
449
+ out_channels,
450
+ num_res_blocks,
451
+ attention_resolutions,
452
+ dropout=0,
453
+ channel_mult=(1, 2, 4, 8),
454
+ conv_resample=True,
455
+ dims=2,
456
+ num_classes=None,
457
+ use_checkpoint=False,
458
+ use_fp16=False,
459
+ num_heads=-1,
460
+ num_head_channels=-1,
461
+ num_heads_upsample=-1,
462
+ use_scale_shift_norm=False,
463
+ resblock_updown=False,
464
+ use_new_attention_order=False,
465
+ use_spatial_transformer=False, # custom transformer support
466
+ transformer_depth=1, # custom transformer support
467
+ context_dim=None, # custom transformer support
468
+ n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
469
+ legacy=True,
470
+ disable_self_attentions=None,
471
+ num_attention_blocks=None
472
+ ):
473
+ super().__init__()
474
+ if use_spatial_transformer:
475
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
476
+
477
+ if context_dim is not None:
478
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
479
+ from omegaconf.listconfig import ListConfig
480
+ if type(context_dim) == ListConfig:
481
+ context_dim = list(context_dim)
482
+
483
+ if num_heads_upsample == -1:
484
+ num_heads_upsample = num_heads
485
+
486
+ if num_heads == -1:
487
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
488
+
489
+ if num_head_channels == -1:
490
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
491
+
492
+ self.image_size = image_size
493
+ self.in_channels = in_channels
494
+ self.model_channels = model_channels
495
+ self.out_channels = out_channels
496
+ if isinstance(num_res_blocks, int):
497
+ self.num_res_blocks = len(channel_mult) * [num_res_blocks]
498
+ else:
499
+ if len(num_res_blocks) != len(channel_mult):
500
+ raise ValueError("provide num_res_blocks either as an int (globally constant) or "
501
+ "as a list/tuple (per-level) with the same length as channel_mult")
502
+ self.num_res_blocks = num_res_blocks
503
+ #self.num_res_blocks = num_res_blocks
504
+ if disable_self_attentions is not None:
505
+ # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
506
+ assert len(disable_self_attentions) == len(channel_mult)
507
+ if num_attention_blocks is not None:
508
+ assert len(num_attention_blocks) == len(self.num_res_blocks)
509
+ assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
510
+ print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
511
+ f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
512
+ f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
513
+ f"attention will still not be set.") # todo: convert to warning
514
+
515
+ self.attention_resolutions = attention_resolutions
516
+ self.dropout = dropout
517
+ self.channel_mult = channel_mult
518
+ self.conv_resample = conv_resample
519
+ self.num_classes = num_classes
520
+ self.use_checkpoint = use_checkpoint
521
+ self.dtype = th.float16 if use_fp16 else th.float32
522
+ self.num_heads = num_heads
523
+ self.num_head_channels = num_head_channels
524
+ self.num_heads_upsample = num_heads_upsample
525
+ self.predict_codebook_ids = n_embed is not None
526
+
527
+ time_embed_dim = model_channels * 4
528
+ self.time_embed = nn.Sequential(
529
+ linear(model_channels, time_embed_dim),
530
+ nn.SiLU(),
531
+ linear(time_embed_dim, time_embed_dim),
532
+ )
533
+
534
+ if self.num_classes is not None:
535
+ self.label_emb = nn.Embedding(num_classes, time_embed_dim)
536
+
537
+ self.input_blocks = nn.ModuleList(
538
+ [
539
+ TimestepEmbedSequential(
540
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
541
+ )
542
+ ]
543
+ )
544
+ self._feature_size = model_channels
545
+ input_block_chans = [model_channels]
546
+ ch = model_channels
547
+ ds = 1
548
+ for level, mult in enumerate(channel_mult):
549
+ for nr in range(self.num_res_blocks[level]):
550
+ layers = [
551
+ ResBlock(
552
+ ch,
553
+ time_embed_dim,
554
+ dropout,
555
+ out_channels=mult * model_channels,
556
+ dims=dims,
557
+ use_checkpoint=use_checkpoint,
558
+ use_scale_shift_norm=use_scale_shift_norm,
559
+ )
560
+ ]
561
+ ch = mult * model_channels
562
+ if ds in attention_resolutions:
563
+ if num_head_channels == -1:
564
+ dim_head = ch // num_heads
565
+ else:
566
+ num_heads = ch // num_head_channels
567
+ dim_head = num_head_channels
568
+ if legacy:
569
+ #num_heads = 1
570
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
571
+ if exists(disable_self_attentions):
572
+ disabled_sa = disable_self_attentions[level]
573
+ else:
574
+ disabled_sa = False
575
+
576
+ if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
577
+ layers.append(
578
+ AttentionBlock(
579
+ ch,
580
+ use_checkpoint=use_checkpoint,
581
+ num_heads=num_heads,
582
+ num_head_channels=dim_head,
583
+ use_new_attention_order=use_new_attention_order,
584
+ ) if not use_spatial_transformer else SpatialTransformer(
585
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
586
+ disable_self_attn=disabled_sa
587
+ )
588
+ )
589
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
590
+ self._feature_size += ch
591
+ input_block_chans.append(ch)
592
+ if level != len(channel_mult) - 1:
593
+ out_ch = ch
594
+ self.input_blocks.append(
595
+ TimestepEmbedSequential(
596
+ ResBlock(
597
+ ch,
598
+ time_embed_dim,
599
+ dropout,
600
+ out_channels=out_ch,
601
+ dims=dims,
602
+ use_checkpoint=use_checkpoint,
603
+ use_scale_shift_norm=use_scale_shift_norm,
604
+ down=True,
605
+ )
606
+ if resblock_updown
607
+ else Downsample(
608
+ ch, conv_resample, dims=dims, out_channels=out_ch
609
+ )
610
+ )
611
+ )
612
+ ch = out_ch
613
+ input_block_chans.append(ch)
614
+ ds *= 2
615
+ self._feature_size += ch
616
+
617
+ if num_head_channels == -1:
618
+ dim_head = ch // num_heads
619
+ else:
620
+ num_heads = ch // num_head_channels
621
+ dim_head = num_head_channels
622
+ if legacy:
623
+ #num_heads = 1
624
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
625
+ self.middle_block = TimestepEmbedSequential(
626
+ ResBlock(
627
+ ch,
628
+ time_embed_dim,
629
+ dropout,
630
+ dims=dims,
631
+ use_checkpoint=use_checkpoint,
632
+ use_scale_shift_norm=use_scale_shift_norm,
633
+ ),
634
+ AttentionBlock(
635
+ ch,
636
+ use_checkpoint=use_checkpoint,
637
+ num_heads=num_heads,
638
+ num_head_channels=dim_head,
639
+ use_new_attention_order=use_new_attention_order,
640
+ ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
641
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
642
+ ),
643
+ ResBlock(
644
+ ch,
645
+ time_embed_dim,
646
+ dropout,
647
+ dims=dims,
648
+ use_checkpoint=use_checkpoint,
649
+ use_scale_shift_norm=use_scale_shift_norm,
650
+ ),
651
+ )
652
+ self._feature_size += ch
653
+
654
+ self.output_blocks = nn.ModuleList([])
655
+ for level, mult in list(enumerate(channel_mult))[::-1]:
656
+ for i in range(self.num_res_blocks[level] + 1):
657
+ ich = input_block_chans.pop()
658
+ layers = [
659
+ ResBlock(
660
+ ch + ich,
661
+ time_embed_dim,
662
+ dropout,
663
+ out_channels=model_channels * mult,
664
+ dims=dims,
665
+ use_checkpoint=use_checkpoint,
666
+ use_scale_shift_norm=use_scale_shift_norm,
667
+ )
668
+ ]
669
+ ch = model_channels * mult
670
+ if ds in attention_resolutions:
671
+ if num_head_channels == -1:
672
+ dim_head = ch // num_heads
673
+ else:
674
+ num_heads = ch // num_head_channels
675
+ dim_head = num_head_channels
676
+ if legacy:
677
+ #num_heads = 1
678
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
679
+ if exists(disable_self_attentions):
680
+ disabled_sa = disable_self_attentions[level]
681
+ else:
682
+ disabled_sa = False
683
+
684
+ if not exists(num_attention_blocks) or i < num_attention_blocks[level]:
685
+ layers.append(
686
+ AttentionBlock(
687
+ ch,
688
+ use_checkpoint=use_checkpoint,
689
+ num_heads=num_heads_upsample,
690
+ num_head_channels=dim_head,
691
+ use_new_attention_order=use_new_attention_order,
692
+ ) if not use_spatial_transformer else SpatialTransformer(
693
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
694
+ disable_self_attn=disabled_sa
695
+ )
696
+ )
697
+ if level and i == self.num_res_blocks[level]:
698
+ out_ch = ch
699
+ layers.append(
700
+ ResBlock(
701
+ ch,
702
+ time_embed_dim,
703
+ dropout,
704
+ out_channels=out_ch,
705
+ dims=dims,
706
+ use_checkpoint=use_checkpoint,
707
+ use_scale_shift_norm=use_scale_shift_norm,
708
+ up=True,
709
+ )
710
+ if resblock_updown
711
+ else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
712
+ )
713
+ ds //= 2
714
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
715
+ self._feature_size += ch
716
+
717
+ self.out = nn.Sequential(
718
+ normalization(ch),
719
+ nn.SiLU(),
720
+ zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
721
+ )
722
+ if self.predict_codebook_ids:
723
+ self.id_predictor = nn.Sequential(
724
+ normalization(ch),
725
+ conv_nd(dims, model_channels, n_embed, 1),
726
+ #nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
727
+ )
728
+
729
def convert_to_fp16(self):
    """
    Convert the torso of the model to float16.

    Applies ``convert_module_to_f16`` to the input, middle and output
    blocks, in that order.
    """
    for torso_part in (self.input_blocks, self.middle_block, self.output_blocks):
        torso_part.apply(convert_module_to_f16)
736
+
737
def convert_to_fp32(self):
    """
    Convert the torso of the model to float32.

    Applies ``convert_module_to_f32`` to the input, middle and output
    blocks, in that order.
    """
    for torso_part in (self.input_blocks, self.middle_block, self.output_blocks):
        torso_part.apply(convert_module_to_f32)
744
+
745
def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
    """
    Apply the model to an input batch.

    The output of every input block is kept and later concatenated (along
    dim 1) with the running activation right before the matching output
    block. Extra keyword arguments are accepted and ignored.

    :param x: an [N x C x ...] Tensor of inputs.
    :param timesteps: a 1-D batch of timesteps.
    :param context: conditioning plugged in via crossattn
    :param y: an [N] Tensor of labels, if class-conditional.
    :return: an [N x C x ...] Tensor of outputs.
    """
    class_conditional = self.num_classes is not None
    assert (y is not None) == (
        class_conditional
    ), "must specify y if and only if the model is class-conditional"

    emb = self.time_embed(
        timestep_embedding(timesteps, self.model_channels, repeat_only=False)
    )
    if class_conditional:
        assert y.shape == (x.shape[0],)
        emb = emb + self.label_emb(y)

    # Run the blocks in the model's dtype, then cast back to the input dtype.
    skips = []
    h = x.type(self.dtype)
    for block in self.input_blocks:
        h = block(h, emb, context)
        skips.append(h)
    h = self.middle_block(h, emb, context)
    for block in self.output_blocks:
        # Skip connections are consumed in reverse order of creation.
        h = block(th.cat([h, skips.pop()], dim=1), emb, context)
    h = h.type(x.dtype)

    if self.predict_codebook_ids:
        return self.id_predictor(h)
    return self.out(h)
778
+
779
+
780
class EncoderUNetModel(nn.Module):
    """
    The half UNet model with attention and timestep embedding.

    Only the downsampling path and the middle block are built. The ``pool``
    argument selects the output head:

    * ``"adaptive"``   - norm, SiLU, adaptive average pool, 1x1 conv, flatten.
    * ``"attention"``  - norm, SiLU, ``AttentionPool2d`` (requires
      ``num_head_channels != -1``).
    * ``"spatial"``    - MLP over the spatial means of every block output.
    * ``"spatial_v2"`` - like ``"spatial"`` with a normalization + SiLU
      between the two linear layers.

    For usage, see UNet.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        pool="adaptive",
        *args,
        **kwargs
    ):
        super().__init__()

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        def res_block(channels, **extra):
            # Every ResBlock in this model shares these settings.
            return ResBlock(
                channels,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
                **extra,
            )

        def attention_block(channels):
            # Every AttentionBlock in this model shares these settings.
            return AttentionBlock(
                channels,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=num_head_channels,
                use_new_attention_order=use_new_attention_order,
            )

        stem = TimestepEmbedSequential(
            conv_nd(dims, in_channels, model_channels, 3, padding=1)
        )
        self.input_blocks = nn.ModuleList([stem])
        # Running sum of the channel counts of all block outputs; it sizes the
        # first linear layer of the "spatial" heads.
        self._feature_size = model_channels
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1  # current downsampling factor
        final_level = len(channel_mult) - 1
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [res_block(ch, out_channels=mult * model_channels)]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    layers.append(attention_block(ch))
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level == final_level:
                # No downsampling after the last resolution level.
                continue
            out_ch = ch
            if resblock_updown:
                reducer = res_block(ch, out_channels=out_ch, down=True)
            else:
                reducer = Downsample(
                    ch, conv_resample, dims=dims, out_channels=out_ch
                )
            self.input_blocks.append(TimestepEmbedSequential(reducer))
            ch = out_ch
            input_block_chans.append(ch)
            ds *= 2
            self._feature_size += ch

        self.middle_block = TimestepEmbedSequential(
            res_block(ch),
            attention_block(ch),
            res_block(ch),
        )
        self._feature_size += ch
        self.pool = pool
        if pool == "adaptive":
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
                nn.AdaptiveAvgPool2d((1, 1)),
                zero_module(conv_nd(dims, ch, out_channels, 1)),
                nn.Flatten(),
            )
        elif pool == "attention":
            assert num_head_channels != -1
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
                AttentionPool2d(
                    (image_size // ds), ch, num_head_channels, out_channels
                ),
            )
        elif pool == "spatial":
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                nn.ReLU(),
                nn.Linear(2048, self.out_channels),
            )
        elif pool == "spatial_v2":
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                normalization(2048),
                nn.SiLU(),
                nn.Linear(2048, self.out_channels),
            )
        else:
            raise NotImplementedError(f"Unexpected {pool} pooling")

    def convert_to_fp16(self):
        """
        Convert the torso of the model to float16.
        """
        for torso_part in (self.input_blocks, self.middle_block):
            torso_part.apply(convert_module_to_f16)

    def convert_to_fp32(self):
        """
        Convert the torso of the model to float32.
        """
        for torso_part in (self.input_blocks, self.middle_block):
            torso_part.apply(convert_module_to_f32)

    def forward(self, x, timesteps):
        """
        Apply the model to an input batch.

        For the "spatial*" heads the mean over dims (2, 3) of every block
        output (input blocks and middle block) is concatenated and fed to the
        head; otherwise only the middle-block output is used.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :return: an [N x K] Tensor of outputs.
        """
        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
        spatial_head = self.pool.startswith("spatial")

        pooled = []
        h = x.type(self.dtype)
        for block in self.input_blocks:
            h = block(h, emb)
            if spatial_head:
                pooled.append(h.type(x.dtype).mean(dim=(2, 3)))
        h = self.middle_block(h, emb)

        if not spatial_head:
            return self.out(h.type(x.dtype))

        pooled.append(h.type(x.dtype).mean(dim=(2, 3)))
        return self.out(th.cat(pooled, axis=-1))
996
+
ldm/modules/diffusionmodules/util.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # adopted from
2
+ # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
3
+ # and
4
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
5
+ # and
6
+ # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
7
+ #
8
+ # thanks!
9
+
10
+
11
+ import os
12
+ import math
13
+ import torch
14
+ import torch.nn as nn
15
+ import numpy as np
16
+ from einops import repeat
17
+
18
+ from ldm.util import instantiate_from_config
19
+
20
+
21
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """
    Build a beta schedule as a float64 NumPy array.

    Supported values of ``schedule``:

    * ``"linear"``      - linear interpolation between the square roots of
      ``linear_start`` and ``linear_end``, squared afterwards.
    * ``"cosine"``      - betas derived from a squared-cosine alpha curve
      offset by ``cosine_s``; values are clamped to ``[0, 0.999]`` and
      ``n_timestep`` betas are returned.
    * ``"sqrt_linear"`` - plain linear interpolation from ``linear_start`` to
      ``linear_end`` (despite the name, no square root is applied).
    * ``"sqrt"``        - square root of the plain linear interpolation.

    :param schedule: name of the schedule (see above).
    :param n_timestep: number of betas to produce.
    :param linear_start: first value for the linear-style schedules.
    :param linear_end: last value for the linear-style schedules.
    :param cosine_s: offset used by the cosine schedule.
    :return: a NumPy array of betas.
    :raises ValueError: if ``schedule`` is not one of the supported names.
    """
    if schedule == "linear":
        betas = (
            torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )

    elif schedule == "cosine":
        timesteps = (
            torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
        )
        alphas = timesteps / (1 + cosine_s) * np.pi / 2
        alphas = torch.cos(alphas).pow(2)
        alphas = alphas / alphas[0]
        betas = 1 - alphas[1:] / alphas[:-1]
        # Clamp in torch: `betas` is a torch tensor and must stay one so the
        # final `.numpy()` call works; np.clip on a tensor only works through
        # NumPy's fallback dispatch for foreign array types.
        betas = torch.clamp(betas, min=0, max=0.999)

    elif schedule == "sqrt_linear":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
    elif schedule == "sqrt":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
    else:
        raise ValueError(f"schedule '{schedule}' unknown.")
    return betas.numpy()
44
+
45
+
46
def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
    """
    Select the DDPM timesteps that a DDIM sampler will visit.

    * ``'uniform'`` - every ``c``-th step with
      ``c = num_ddpm_timesteps // num_ddim_timesteps``. When the division is
      not exact this yields more than ``num_ddim_timesteps`` entries
      (e.g. 10 DDPM steps with 3 requested gives 4).
    * ``'quad'``    - ``num_ddim_timesteps`` quadratically spaced steps
      covering 80% of the DDPM range.

    One is added to every selected index before returning.

    :param ddim_discr_method: ``'uniform'`` or ``'quad'``.
    :param num_ddim_timesteps: requested number of DDIM steps.
    :param num_ddpm_timesteps: number of steps of the underlying DDPM.
    :param verbose: if True, print the selected timesteps.
    :return: a NumPy integer array of selected timesteps (already shifted by 1).
    :raises ValueError: for ``'uniform'`` when more DDIM than DDPM steps are
        requested (the stride would be zero).
    :raises NotImplementedError: for an unknown discretization method.
    """
    if ddim_discr_method == 'uniform':
        c = num_ddpm_timesteps // num_ddim_timesteps
        if c == 0:
            # range() would raise an opaque "arg 3 must not be zero" error.
            raise ValueError(
                f'num_ddim_timesteps ({num_ddim_timesteps}) must not exceed '
                f'num_ddpm_timesteps ({num_ddpm_timesteps}) for uniform discretization'
            )
        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
    elif ddim_discr_method == 'quad':
        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
    else:
        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')

    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
    # add one to get the final alpha values right (the ones from first scale to data during sampling)
    steps_out = ddim_timesteps + 1
    if verbose:
        print(f'Selected timesteps for ddim sampler: {steps_out}')
    return steps_out
61
+
62
+
63
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    """
    Compute the DDIM sigma schedule and the matching cumulative alphas.

    :param alphacums: cumulative alpha products, indexable by ``ddim_timesteps``.
    :param ddim_timesteps: indices of the timesteps used by the sampler.
    :param eta: scale applied to the sigmas (0 gives all-zero sigmas).
    :param verbose: if True, print the selected alphas and sigmas.
    :return: tuple ``(sigmas, alphas, alphas_prev)``.
    """
    # select alphas for computing the variance schedule
    alphas = alphacums[ddim_timesteps]
    # a_(t-1) for the first selected step is the very first cumulative alpha.
    earlier_alphas = alphacums[ddim_timesteps[:-1]].tolist()
    alphas_prev = np.asarray([alphacums[0]] + earlier_alphas)

    # according to the formula provided in https://arxiv.org/abs/2010.02502
    variance_ratio = (1 - alphas_prev) / (1 - alphas)
    sigmas = eta * np.sqrt(variance_ratio * (1 - alphas / alphas_prev))
    if verbose:
        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
        print(f'For the chosen value of eta, which is {eta}, '
              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
    return sigmas, alphas, alphas_prev
75
+
76
+
77
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a continuous ``alpha_bar`` function into a beta schedule.

    ``alpha_bar`` defines the cumulative product of (1-beta) over time
    t in [0, 1]. For each step i the beta is
    ``1 - alpha_bar((i + 1) / N) / alpha_bar(i / N)``, capped at ``max_beta``.

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    :return: a NumPy array with ``num_diffusion_timesteps`` betas.
    """
    total = num_diffusion_timesteps
    schedule = [
        min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
        for step in range(total)
    ]
    return np.array(schedule)
94
+
95
+
96
def extract_into_tensor(a, t, x_shape):
    """
    Gather ``a`` at the indices ``t`` and reshape for broadcasting.

    :param a: tensor to gather from along its last dimension.
    :param t: index tensor; its first dimension is the batch size.
    :param x_shape: shape of the tensor the result should broadcast against.
    :return: the gathered values with shape ``(batch, 1, ..., 1)`` having
        ``len(x_shape)`` dimensions.
    """
    batch, *_ = t.shape
    singleton_dims = (1,) * (len(x_shape) - 1)
    gathered = a.gather(-1, t)
    return gathered.reshape(batch, *singleton_dims)
100
+
101
+
102
def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing and simply call
                 ``func(*inputs)``.
    :return: whatever ``func`` returns.
    """
    if not flag:
        return func(*inputs)
    # Inputs come first; the count tells CheckpointFunction where the
    # implicit parameters start.
    packed = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *packed)
117
+
118
+
119
class CheckpointFunction(torch.autograd.Function):
    """
    Autograd function implementing gradient checkpointing.

    The forward pass runs the wrapped function under ``torch.no_grad()``; the
    backward pass re-runs it with gradients enabled and differentiates the
    recomputed output with respect to the inputs and the extra parameters.
    """

    @staticmethod
    def forward(ctx, run_function, length, *args):
        # The first `length` entries of `args` are the explicit inputs, the
        # remaining ones are parameters the function depends on implicitly.
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        with torch.no_grad():
            return ctx.run_function(*ctx.input_tensors)

    @staticmethod
    def backward(ctx, *output_grads):
        ctx.input_tensors = [t.detach().requires_grad_(True) for t in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            views = [t.view_as(t) for t in ctx.input_tensors]
            recomputed = ctx.run_function(*views)
        grads = torch.autograd.grad(
            recomputed,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del recomputed
        # No gradients for `run_function` and `length`.
        return (None, None) + grads
149
+
150
+
151
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: if True, skip the sinusoids and just repeat each
                        timestep value ``dim`` times.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if repeat_only:
        return repeat(timesteps, 'b -> b d', d=dim)

    half = dim // 2
    scaled = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32)
    freqs = torch.exp(scaled / half).to(device=timesteps.device)
    args = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        # Odd dim: pad with one zero column so the width is exactly `dim`.
        zero_column = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, zero_column], dim=-1)
    return embedding
172
+
173
+
174
def zero_module(module):
    """
    Set every parameter of ``module`` to zero in place and return the module.

    :param module: the module whose parameters are zeroed.
    :return: the same ``module`` object.
    """
    for param in module.parameters():
        # detach() so the in-place write is not tracked by autograd.
        param.detach().zero_()
    return module
181
+
182
+
183
def scale_module(module, scale):
    """
    Multiply every parameter of ``module`` by ``scale`` in place and return
    the module.

    :param module: the module whose parameters are scaled.
    :param scale: the multiplicative factor.
    :return: the same ``module`` object.
    """
    for param in module.parameters():
        # detach() so the in-place write is not tracked by autograd.
        param.detach().mul_(scale)
    return module
190
+
191
+
192
def mean_flat(tensor):
    """
    Average ``tensor`` over every dimension except the first (batch) one.

    :param tensor: input tensor.
    :return: the mean taken over dimensions 1..ndim-1.
    """
    non_batch_dims = list(range(1, tensor.dim()))
    return tensor.mean(dim=non_batch_dims)
197
+
198
+
199
def normalization(channels):
    """
    Make a standard normalization layer: a ``GroupNorm32`` with 32 groups.

    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    num_groups = 32
    return GroupNorm32(num_groups, channels)
206
+
207
+
208
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
209
class SiLU(nn.Module):
    """Sigmoid-weighted linear unit: ``x * sigmoid(x)``."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
212
+
213
+
214
class GroupNorm32(nn.GroupNorm):
    """GroupNorm that computes in float32 and returns the input's dtype."""

    def forward(self, x):
        input_dtype = x.dtype
        normed = super().forward(x.float())
        return normed.type(input_dtype)
217
+
218
def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.

    :param dims: 1, 2 or 3; selects ``nn.Conv1d`` / ``nn.Conv2d`` / ``nn.Conv3d``.
    :param args: positional arguments forwarded to the convolution class.
    :param kwargs: keyword arguments forwarded to the convolution class.
    :return: the constructed convolution module.
    :raises ValueError: if ``dims`` is not 1, 2 or 3.
    """
    for rank, conv_cls in ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d)):
        if dims == rank:
            return conv_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
229
+
230
+
231
def linear(*args, **kwargs):
    """
    Create a linear module; all arguments are forwarded to ``nn.Linear``.
    """
    layer = nn.Linear(*args, **kwargs)
    return layer
236
+
237
+
238
def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.

    :param dims: 1, 2 or 3; selects ``nn.AvgPool1d`` / ``nn.AvgPool2d`` /
                 ``nn.AvgPool3d``.
    :param args: positional arguments forwarded to the pooling class.
    :param kwargs: keyword arguments forwarded to the pooling class.
    :return: the constructed pooling module.
    :raises ValueError: if ``dims`` is not 1, 2 or 3.
    """
    for rank, pool_cls in ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d)):
        if dims == rank:
            return pool_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
249
+
250
+
251
class HybridConditioner(nn.Module):
    """
    Bundle two conditioners: one for concatenation conditioning and one for
    cross-attention conditioning. Both are built with
    ``instantiate_from_config``.
    """

    def __init__(self, c_concat_config, c_crossattn_config):
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)

    def forward(self, c_concat, c_crossattn):
        """
        Run both conditioners.

        :return: a dict with keys ``'c_concat'`` and ``'c_crossattn'``, each
            holding a one-element list with the respective conditioner output.
        """
        concat_out = self.concat_conditioner(c_concat)
        crossattn_out = self.crossattn_conditioner(c_crossattn)
        return {'c_concat': [concat_out], 'c_crossattn': [crossattn_out]}
262
+
263
+
264
def noise_like(shape, device, repeat=False):
    """
    Draw standard normal noise of the given shape on ``device``.

    :param shape: shape of the returned tensor.
    :param device: device on which the noise is created.
    :param repeat: if True, draw a single sample of shape ``(1, *shape[1:])``
        and repeat it ``shape[0]`` times along the first dimension.
    :return: a tensor of shape ``shape``.
    """
    if repeat:
        single = torch.randn((1, *shape[1:]), device=device)
        return single.repeat(shape[0], *((1,) * (len(shape) - 1)))
    return torch.randn(shape, device=device)
ldm/modules/distributions/__init__.py ADDED
File without changes
ldm/modules/distributions/distributions.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
class AbstractDistribution:
    """Interface for distributions; subclasses must override both methods."""

    def sample(self):
        """Draw a sample. Not implemented here."""
        raise NotImplementedError()

    def mode(self):
        """Return the mode. Not implemented here."""
        raise NotImplementedError()
11
+
12
+
13
class DiracDistribution(AbstractDistribution):
    """Distribution concentrated on a single value."""

    def __init__(self, value):
        self.value = value

    def sample(self):
        """Return the stored value."""
        return self.value

    def mode(self):
        """Return the stored value."""
        return self.value
22
+
23
+
24
class DiagonalGaussianDistribution(object):
    """
    Gaussian with diagonal covariance, parameterized by mean and log-variance.

    ``parameters`` is split into two equal chunks along dim 1: the first is
    the mean, the second the log-variance (clamped to ``[-30, 20]``). With
    ``deterministic=True`` the standard deviation and variance are zero, so
    ``sample()`` returns the mean.
    """

    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def _zero(self):
        """Return a one-element zero tensor on the parameters' device."""
        return torch.tensor([0.], device=self.parameters.device)

    def sample(self):
        """Return ``mean + std * eps`` with ``eps`` standard normal noise."""
        # Noise is drawn with the default (CPU) generator and then moved;
        # kept as-is so seeded results do not change.
        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
        return x

    def kl(self, other=None):
        """
        KL divergence to ``other``, or to a standard normal if ``other`` is
        None, summed over dims 1, 2 and 3.

        :return: a per-sample tensor; a single zero (on the parameters'
            device) when the distribution is deterministic.
        """
        if self.deterministic:
            # Previously a CPU tensor regardless of where the parameters
            # live, which breaks arithmetic with tensors on another device.
            return self._zero()
        if other is None:
            return 0.5 * torch.sum(torch.pow(self.mean, 2)
                                   + self.var - 1.0 - self.logvar,
                                   dim=[1, 2, 3])
        return 0.5 * torch.sum(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var - 1.0 - self.logvar + other.logvar,
            dim=[1, 2, 3])

    def nll(self, sample, dims=(1, 2, 3)):
        """
        Negative log-likelihood of ``sample``, summed over ``dims``.

        :param sample: tensor to evaluate.
        :param dims: dimensions to sum over (immutable default).
        :return: a tensor reduced over ``dims``; a single zero (on the
            parameters' device) when the distribution is deterministic.
        """
        if self.deterministic:
            return self._zero()
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        """Return the mean."""
        return self.mean
63
+
64
+
65
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    Compute the KL divergence between two gaussians.
    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases. At least one argument must be a Tensor.
    """
    tensor = next(
        (obj for obj in (mean1, logvar1, mean2, logvar2) if isinstance(obj, torch.Tensor)),
        None,
    )
    assert tensor is not None, "at least one argument must be a Tensor"

    def as_tensor(value):
        # Force variances to be Tensors. Broadcasting helps convert scalars to
        # Tensors, but it does not work for torch.exp().
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value).to(tensor)

    logvar1 = as_tensor(logvar1)
    logvar2 = as_tensor(logvar2)

    mean_term = ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + mean_term
    )
ldm/modules/ema.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
class LitEma(nn.Module):
    """
    Exponential moving average (EMA) of a model's trainable parameters.

    A shadow copy of every parameter with ``requires_grad=True`` is stored
    as a buffer. Buffer names are the parameter names with the dots removed,
    because '.' is not allowed in buffer names.

    :param model: the model whose parameters are tracked.
    :param decay: EMA decay, must lie in [0, 1].
    :param use_num_upates: if True, count updates and use the warm-up decay
        ``min(decay, (1 + n) / (10 + n))``; if False the counter is fixed at
        -1 and ``decay`` is used directly.
    :raises ValueError: if ``decay`` is outside [0, 1].
    """

    def __init__(self, model, decay=0.9999, use_num_upates=True):
        super().__init__()
        if decay < 0.0 or decay > 1.0:
            raise ValueError('Decay must be between 0 and 1')

        # Maps model parameter name -> shadow buffer name.
        self.m_name2s_name = {}
        self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
        self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
                             else torch.tensor(-1,dtype=torch.int))

        for name, p in model.named_parameters():
            if p.requires_grad:
                #remove as '.'-character is not allowed in buffers
                s_name = name.replace('.','')
                self.m_name2s_name.update({name:s_name})
                self.register_buffer(s_name,p.clone().detach().data)

        self.collected_params = []

    def forward(self,model):
        """Update the shadow parameters from the current ``model`` parameters."""
        decay = self.decay

        if self.num_updates >= 0:
            self.num_updates += 1
            decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))

        one_minus_decay = 1.0 - decay

        with torch.no_grad():
            m_param = dict(model.named_parameters())
            shadow_params = dict(self.named_buffers())

            for key, param in m_param.items():
                if param.requires_grad:
                    shadow = shadow_params[self.m_name2s_name[key]]
                    # Bring the parameter to the shadow's dtype/device rather
                    # than the other way round: converting the shadow with
                    # type_as() returns a *copy* whenever the types differ,
                    # so the in-place update below would be silently lost.
                    current = param.to(device=shadow.device, dtype=shadow.dtype)
                    shadow.sub_(one_minus_decay * (shadow - current))
                else:
                    assert key not in self.m_name2s_name

    def copy_to(self, model):
        """Copy the shadow (EMA) values into the trainable parameters of ``model``."""
        m_param = dict(model.named_parameters())
        shadow_params = dict(self.named_buffers())
        for key, param in m_param.items():
            if param.requires_grad:
                param.data.copy_(shadow_params[self.m_name2s_name[key]].data)
            else:
                assert key not in self.m_name2s_name

    def store(self, parameters):
        """
        Save the current parameters for restoring later.
        Args:
          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
            temporarily stored.
        """
        self.collected_params = [param.clone() for param in parameters]

    def restore(self, parameters):
        """
        Restore the parameters stored with the `store` method.
        Useful to validate the model with EMA parameters without affecting the
        original optimization process. Store the parameters before the
        `copy_to` method. After validation (or model saving), use this to
        restore the former parameters.
        Args:
          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
            updated with the stored parameters.
        """
        for c_param, param in zip(self.collected_params, parameters):
            param.data.copy_(c_param.data)
ldm/modules/encoders/__init__.py ADDED
File without changes
ldm/modules/encoders/modules.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from functools import partial
5
+ import kornia
6
+
7
+ from ldm.modules.x_transformer import Encoder, TransformerWrapper # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
8
+ from ldm.util import default
9
+ import clip
10
+
11
+
12
class AbstractEncoder(nn.Module):
    """Base class for encoders; subclasses must implement ``encode``."""

    def __init__(self):
        super().__init__()

    def encode(self, *args, **kwargs):
        """Encode the inputs. Not implemented in the base class."""
        raise NotImplementedError
18
+
19
class IdentityEncoder(AbstractEncoder):
    """Encoder that returns its input unchanged."""

    def encode(self, x):
        return x
23
+
24
class FaceClipEncoder(AbstractEncoder):
    """
    Encode a fixed face crop and the rest of the image separately with a
    ``FrozenCLIPImageEmbedder`` and concatenate the two encodings along dim 1.
    """

    def __init__(self, augment=True, retreival_key=None):
        super().__init__()
        self.encoder = FrozenCLIPImageEmbedder()
        self.augment = augment
        self.retreival_key = retreival_key

    def forward(self, img):
        # Fixed face window: rows 190..439, columns 125..386.
        x_offset = 125
        face_rows = slice(190, 440)
        face_cols = slice(x_offset, 512 - x_offset)

        with torch.no_grad():
            if self.retreival_key:
                # Assumes retrieved image are packed into the second half of channels
                face = img[:, 3:, face_rows, face_cols]
                other = img[:, :3, ...].clone()
            else:
                face = img[:, :, face_rows, face_cols]
                other = img.clone()

            if self.augment:
                # NOTE(review): `K` is not among the imports visible at the
                # top of this file - presumably bound elsewhere; confirm.
                face = K.RandomHorizontalFlip()(face)

            # Blank out the face window in the "rest of the image" copy.
            other[:, :, face_rows, face_cols] *= 0
            encodings = [
                self.encoder.encode(face),
                self.encoder.encode(other),
            ]

        return torch.cat(encodings, dim=1)

    def encode(self, img):
        """Encode ``img``; a list input yields the all-zero unconditional embedding."""
        if not isinstance(img, list):
            return self(img)
        # Uncondition
        device = self.encoder.model.visual.conv1.weight.device
        return torch.zeros((1, 2, 768), device=device)
60
+
61
class FaceIdClipEncoder(AbstractEncoder):
    """
    Encode an image with a ``FrozenFaceEncoder`` (on the image resized to
    256x256) and with a ``FrozenCLIPImageEmbedder`` (on a copy with a fixed
    window zeroed out), and concatenate both encodings along dim 1.

    :param id_model_path: path handed to ``FrozenFaceEncoder``. The default
        is the original hard-coded, machine-specific location; pass your own
        path on any other machine.
    """

    def __init__(self, id_model_path="/home/jpinkney/code/stable-diffusion/model_ir_se50.pth"):
        super().__init__()
        self.encoder = FrozenCLIPImageEmbedder()
        for p in self.encoder.parameters():
            p.requires_grad = False
        self.id = FrozenFaceEncoder(id_model_path, augment=True)

    def forward(self, img):
        with torch.no_grad():
            face = kornia.geometry.resize(img, (256, 256),
                                          interpolation='bilinear', align_corners=True)

            # Zero the fixed window (rows 184..451, columns 122..395) before
            # encoding the remainder of the image.
            other = img.clone()
            other[:,:,184:452,122:396] *= 0
            encodings = [
                self.id.encode(face),
                self.encoder.encode(other),
            ]

        return torch.cat(encodings, dim=1)

    def encode(self, img):
        """Encode ``img``; a list input yields the all-zero unconditional embedding."""
        if isinstance(img, list):
            # Uncondition
            return torch.zeros((1, 2, 768), device=self.encoder.model.visual.conv1.weight.device)

        return self(img)
90
+
91
class ClassEmbedder(nn.Module):
    """
    Embed class indices taken from a batch dict.

    :param embed_dim: size of each embedding vector.
    :param n_classes: number of rows in the embedding table.
    :param key: default batch key under which the class indices are stored.
    """

    def __init__(self, embed_dim, n_classes=1000, key='class'):
        super().__init__()
        self.key = key
        self.embedding = nn.Embedding(n_classes, embed_dim)

    def forward(self, batch, key=None):
        """
        Look up ``batch[key]`` (falling back to ``self.key``) and embed it.
        A singleton dimension is inserted at position 1 before the lookup.
        """
        lookup_key = self.key if key is None else key
        # this is for use in crossattn
        class_ids = batch[lookup_key][:, None]
        return self.embedding(class_ids)
104
+
105
+
106
class TransformerEmbedder(AbstractEncoder):
    """Some transformer encoder layers"""

    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
        super().__init__()
        self.device = device
        attn_layers = Encoder(dim=n_embed, depth=n_layer)
        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
                                              attn_layers=attn_layers)

    def forward(self, tokens):
        """Move ``tokens`` to ``self.device`` and return the transformer embeddings."""
        on_device = tokens.to(self.device)  # meh
        return self.transformer(on_device, return_embeddings=True)

    def encode(self, x):
        """Alias for calling the module."""
        return self(x)
121
+
122
+
123
class BERTTokenizer(AbstractEncoder):
    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""

    def __init__(self, device="cuda", vq_interface=True, max_length=77):
        super().__init__()
        from transformers import BertTokenizerFast  # TODO: add to requirements
        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        self.device = device
        self.vq_interface = vq_interface
        self.max_length = max_length

    def forward(self, text):
        """Tokenize ``text`` (padded/truncated to ``max_length``) and return the ids on ``self.device``."""
        encoded = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                 return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
        return encoded["input_ids"].to(self.device)

    @torch.no_grad()
    def encode(self, text):
        """
        Tokenize ``text``.

        :return: the token ids, or ``(None, None, [None, None, tokens])``
            when ``vq_interface`` is set.
        """
        tokens = self(text)
        if self.vq_interface:
            return None, None, [None, None, tokens]
        return tokens

    def decode(self, text):
        """Return ``text`` unchanged."""
        return text
148
+
149
+
150
class BERTEmbedder(AbstractEncoder):
    """Uses the BERT tokenizr model and add some transformer encoder layers"""

    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
                 device="cuda", use_tokenizer=True, embedding_dropout=0.0):
        super().__init__()
        self.use_tknz_fn = use_tokenizer
        if self.use_tknz_fn:
            # NOTE(review): ``device`` is not forwarded, so the tokenizer keeps its own
            # default device - confirm this is intended when device != "cuda".
            self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
        self.device = device
        encoder_layers = Encoder(dim=n_embed, depth=n_layer)
        self.transformer = TransformerWrapper(
            num_tokens=vocab_size,
            max_seq_len=max_seq_len,
            attn_layers=encoder_layers,
            emb_dropout=embedding_dropout,
        )

    def forward(self, text):
        """Tokenize ``text`` when a tokenizer is configured, then return transformer embeddings."""
        # Without a tokenizer the input is passed to the transformer as-is.
        tokens = self.tknz_fn(text) if self.use_tknz_fn else text  # .to(self.device)
        return self.transformer(tokens, return_embeddings=True)

    def encode(self, text):
        # output of length 77
        return self(text)
174
+
175
+
176
+ from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel
177
+
178
def disabled_train(self, mode=True):
    """Drop-in replacement for ``model.train`` that ignores ``mode``.

    Assigning this over a module's ``train`` keeps its train/eval state
    from being switched afterwards; the receiver is returned untouched.
    """
    return self
182
+
183
+
184
class FrozenT5Embedder(AbstractEncoder):
    """Uses the T5 transformer encoder for text"""

    # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
    def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77):
        super().__init__()
        self.tokenizer = T5Tokenizer.from_pretrained(version)
        self.transformer = T5EncoderModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length   # TODO: typical value?
        self.freeze()

    def freeze(self):
        """Switch the transformer to eval mode and turn off gradients for every parameter."""
        self.transformer = self.transformer.eval()
        # self.train = disabled_train
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, text):
        """Tokenize ``text`` (truncated/padded to ``max_length``) and return the last hidden state."""
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=True,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].to(self.device)
        return self.transformer(input_ids=input_ids).last_hidden_state

    def encode(self, text):
        """Alias for calling the module directly."""
        return self(text)
211
+
212
+ from ldm.thirdp.psp.id_loss import IDFeatures
213
+ import kornia.augmentation as K
214
+
215
class FrozenFaceEncoder(AbstractEncoder):
    """Maps features from a frozen ``IDFeatures`` network to 768-d tokens via a trainable linear layer."""

    def __init__(self, model_path, augment=False):
        """
        :param model_path: forwarded to ``IDFeatures``.
        :param augment: when truthy, inputs are randomly flipped/equalized before encoding.
        """
        super().__init__()
        self.loss_fn = IDFeatures(model_path)
        # face encoder is frozen
        for p in self.loss_fn.parameters():
            p.requires_grad = False
        # Mapper is trainable
        self.mapper = torch.nn.Linear(512, 768)
        p = 0.25
        if augment:
            self.augment = K.AugmentationSequential(
                K.RandomHorizontalFlip(p=0.5),
                K.RandomEqualize(p=p),
                # K.RandomPlanckianJitter(p=p),
                # K.RandomPlasmaBrightness(p=p),
                # K.RandomPlasmaContrast(p=p),
                # K.ColorJiggle(0.02, 0.2, 0.2, p=p),
            )
        else:
            # Must be None (not False): forward() guards with ``is not None``, and a
            # False value would pass that guard and then be called like a function.
            self.augment = None

    def forward(self, img):
        """Return a ``(batch, 1, 768)`` token per image; a list input yields the zero "uncondition" token."""
        if isinstance(img, list):
            # Uncondition
            return torch.zeros((1, 1, 768), device=self.mapper.weight.device)

        if self.augment is not None:
            # Transforms require 0-1
            img = self.augment((img + 1)/2)
            img = 2*img - 1

        feat = self.loss_fn(img, crop=True)
        feat = self.mapper(feat.unsqueeze(1))
        return feat

    def encode(self, img):
        """Alias for calling the module directly."""
        return self(img)
253
+
254
class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from huggingface)"""

    # alternative version: clip-vit-base-patch32
    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length   # TODO: typical value?
        self.freeze()

    def freeze(self):
        """Switch the transformer to eval mode and turn off gradients for every parameter."""
        self.transformer = self.transformer.eval()
        # self.train = disabled_train
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, text):
        """Tokenize ``text`` (truncated/padded to ``max_length``) and return the last hidden state."""
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=True,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].to(self.device)
        return self.transformer(input_ids=input_ids).last_hidden_state

    def encode(self, text):
        """Alias for calling the module directly."""
        return self(text)
281
+
282
+ import torch.nn.functional as F
283
+ from transformers import CLIPVisionModel
284
class ClipImageProjector(AbstractEncoder):
    """
    Uses the CLIP image encoder.
    """

    # alternative version: clip-vit-base-patch32
    def __init__(self, version="openai/clip-vit-large-patch14", max_length=77):
        super().__init__()
        self.model = CLIPVisionModel.from_pretrained(version)
        self.model.train()
        self.max_length = max_length   # TODO: typical value?
        self.antialias = True
        self.mapper = torch.nn.Linear(1024, 768)
        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
        # Needs the 'mean' buffer to exist already: get_null_cond reads its device.
        self.register_buffer('null_cond', self.get_null_cond(version, max_length))

    @torch.no_grad()
    def get_null_cond(self, version, max_length):
        """Return the text embedding of the empty prompt from a ``FrozenCLIPEmbedder``."""
        text_embedder = FrozenCLIPEmbedder(
            version=version, device=self.mean.device, max_length=max_length
        )
        return text_embedder([""])

    def preprocess(self, x):
        """Resize to 224x224, map from [-1, 1] to [0, 1] and apply the stored mean/std normalization."""
        # Expects inputs in the range -1, 1
        resized = kornia.geometry.resize(
            x, (224, 224),
            interpolation='bicubic', align_corners=True,
            antialias=self.antialias,
        )
        unit_range = (resized + 1.) / 2.
        # renormalize according to clip
        return kornia.enhance.normalize(unit_range, self.mean, self.std)

    def forward(self, x):
        """Project the vision model's hidden states to 768 dims; a list input returns ``null_cond``."""
        if isinstance(x, list):
            return self.null_cond
        # x is assumed to be in range [-1,1]
        pixel_values = self.preprocess(x)
        hidden = self.model(pixel_values=pixel_values).last_hidden_state
        hidden = self.mapper(hidden)
        # Pad amount on the token axis is whatever is missing to reach max_length
        # (the value is handed to F.pad unchanged, including when it is negative).
        token_pad = self.max_length - hidden.shape[1]
        return F.pad(hidden, [0, 0, 0, token_pad, 0, 0])

    def encode(self, im):
        """Alias for calling the module directly."""
        return self(im)
329
+
330
class ProjectedFrozenCLIPEmbedder(AbstractEncoder):
    """``FrozenCLIPEmbedder`` followed by a 768 -> 768 linear projection."""

    # alternative version: clip-vit-base-patch32
    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
        super().__init__()
        self.embedder = FrozenCLIPEmbedder(version=version, device=device, max_length=max_length)
        self.projection = torch.nn.Linear(768, 768)

    def forward(self, text):
        """Embed ``text`` and pass the result through the projection layer."""
        return self.projection(self.embedder(text))

    def encode(self, text):
        """Alias for calling the module directly."""
        return self(text)
342
+
343
class FrozenCLIPImageEmbedder(AbstractEncoder):
    """
    Uses the CLIP image encoder.
    Not actually frozen... If you want that set cond_stage_trainable=False in cfg
    """

    def __init__(
            self,
            model='ViT-L/14',
            jit=False,
            device='cpu',
            antialias=False,
        ):
        super().__init__()
        loaded_model, _ = clip.load(name=model, device=device, jit=jit)
        self.model = loaded_model
        # We don't use the text part so delete it
        del self.model.transformer
        self.antialias = antialias
        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)

    def preprocess(self, x):
        """Resize to 224x224, map from [-1, 1] to [0, 1] and apply the stored mean/std normalization."""
        # Expects inputs in the range -1, 1
        resized = kornia.geometry.resize(
            x, (224, 224),
            interpolation='bicubic', align_corners=True,
            antialias=self.antialias,
        )
        unit_range = (resized + 1.) / 2.
        # renormalize according to clip
        return kornia.enhance.normalize(unit_range, self.mean, self.std)

    def forward(self, x):
        """Return CLIP image features as float32; a list input returns a ``(1, 768)`` zero tensor."""
        # x is assumed to be in range [-1,1]
        if not isinstance(x, list):
            return self.model.encode_image(self.preprocess(x)).float()
        # [""] denotes condition dropout for ucg
        weight_device = self.model.visual.conv1.weight.device
        return torch.zeros(1, 768, device=weight_device)

    def encode(self, im):
        """Run the forward pass and insert an axis at dim 1."""
        return self(im).unsqueeze(1)
383
+
384
+ from torchvision import transforms
385
+ import random
386
+
387
class FrozenCLIPImageMutliEmbedder(AbstractEncoder):
    """
    Uses the CLIP image encoder.
    Not actually frozen... If you want that set cond_stage_trainable=False in cfg
    """

    def __init__(
            self,
            model='ViT-L/14',
            jit=False,
            device='cpu',
            antialias=True,
            max_crops=5,
        ):
        """
        :param model: model name passed to ``clip.load``.
        :param jit: passed to ``clip.load``.
        :param device: passed to ``clip.load``.
        :param antialias: stored on the instance (not read by ``preprocess`` below).
        :param max_crops: number of random crops encoded per image.
        """
        super().__init__()
        self.model, _ = clip.load(name=model, device=device, jit=jit)
        # We don't use the text part so delete it
        del self.model.transformer
        self.antialias = antialias
        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
        self.max_crops = max_crops

    def preprocess(self, x):
        """Take ``max_crops`` random square 224x224 crops of ``x``, stack them on dim 0 and normalize."""
        # Expects inputs in the range -1, 1
        randcrop = transforms.RandomResizedCrop(224, scale=(0.085, 1.0), ratio=(1, 1))
        crops = [randcrop(x) for _ in range(self.max_crops)]
        x = torch.cat(crops, dim=0)
        x = (x + 1.) / 2.
        # renormalize according to clip
        x = kornia.enhance.normalize(x, self.mean, self.std)
        return x

    def forward(self, x):
        """
        Encode random crops of every image in ``x``.

        :param x: batch of images (assumed to be in range [-1, 1]), or a list to
                  request the zero "unconditional" embedding.
        :return: tensor with one row of crop tokens per image, stacked on dim 0;
                 ``(1, max_crops, 768)`` zeros for a list input.
        """
        # x is assumed to be in range [-1,1]
        if isinstance(x, list):
            # [""] denotes condition dropout for ucg
            device = self.model.visual.conv1.weight.device
            return torch.zeros(1, self.max_crops, 768, device=device)
        batch_tokens = []
        for im in x:
            patches = self.preprocess(im.unsqueeze(0))
            tokens = self.model.encode_image(patches).float()
            # Zero each crop token with probability 0.1. This is done with an
            # out-of-place mask: rows obtained by iterating a tensor are views, and
            # autograd refuses in-place edits on them when gradients are tracked.
            keep = [0.0 if random.random() < 0.1 else 1.0 for _ in range(tokens.shape[0])]
            tokens = tokens * tokens.new_tensor(keep).unsqueeze(1)
            batch_tokens.append(tokens.unsqueeze(0))

        return torch.cat(batch_tokens, dim=0)

    def encode(self, im):
        """Alias for calling the module directly."""
        return self(im)
442
+
443
class SpatialRescaler(nn.Module):
    """Repeatedly rescales a feature map by ``multiplier`` and optionally remaps channels with a 1x1 conv."""

    def __init__(self,
                 n_stages=1,
                 method='bilinear',
                 multiplier=0.5,
                 in_channels=3,
                 out_channels=None,
                 bias=False):
        super().__init__()
        self.n_stages = n_stages
        assert self.n_stages >= 0
        assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
        self.multiplier = multiplier
        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
        # The 1x1 conv only exists when an output channel count was requested.
        self.remap_output = out_channels is not None
        if self.remap_output:
            print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
            self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)

    def forward(self,x):
        """Apply ``n_stages`` interpolation steps, then the channel mapper if configured."""
        remaining = self.n_stages
        while remaining > 0:
            x = self.interpolator(x, scale_factor=self.multiplier)
            remaining -= 1

        return self.channel_mapper(x) if self.remap_output else x

    def encode(self, x):
        """Alias for calling the module directly."""
        return self(x)
473
+
474
+
475
+ from ldm.util import instantiate_from_config
476
+ from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
477
+
478
+
479
class LowScaleEncoder(nn.Module):
    """Encodes inputs with a configured model and noises the latents at a random level."""

    def __init__(self, model_config, linear_start, linear_end, timesteps=1000, max_noise_level=250, output_size=64,
                 scale_factor=1.0):
        super().__init__()
        self.max_noise_level = max_noise_level
        self.model = instantiate_from_config(model_config)
        # register_schedule() has no return statement, so this attribute is None.
        self.augmentation_schedule = self.register_schedule(
            timesteps=timesteps, linear_start=linear_start, linear_end=linear_end
        )
        self.out_size = output_size
        self.scale_factor = scale_factor

    def register_schedule(self, beta_schedule="linear", timesteps=1000,
                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
        """Build the beta schedule and register it and its derived quantities as float32 buffers."""
        betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
                                   cosine_s=cosine_s)
        alphas_cumprod = np.cumprod(1. - betas, axis=0)
        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)
        self.linear_start = linear_start
        self.linear_end = linear_end
        assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'

        schedule_arrays = {
            'betas': betas,
            'alphas_cumprod': alphas_cumprod,
            'alphas_cumprod_prev': alphas_cumprod_prev,
            # calculations for diffusion q(x_t | x_{t-1}) and others
            'sqrt_alphas_cumprod': np.sqrt(alphas_cumprod),
            'sqrt_one_minus_alphas_cumprod': np.sqrt(1. - alphas_cumprod),
            'log_one_minus_alphas_cumprod': np.log(1. - alphas_cumprod),
            'sqrt_recip_alphas_cumprod': np.sqrt(1. / alphas_cumprod),
            'sqrt_recipm1_alphas_cumprod': np.sqrt(1. / alphas_cumprod - 1),
        }
        # Insertion order of the dict fixes the buffer registration order.
        for buffer_name, values in schedule_arrays.items():
            self.register_buffer(buffer_name, torch.tensor(values, dtype=torch.float32))

    def q_sample(self, x_start, t, noise=None):
        """Return ``sqrt_alphas_cumprod[t] * x_start + sqrt_one_minus_alphas_cumprod[t] * noise``."""
        noise = default(noise, lambda: torch.randn_like(x_start))
        signal_scale = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape)
        noise_scale = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
        return signal_scale * x_start + noise_scale * noise

    def forward(self, x):
        """Encode ``x``, scale, noise at a random level per sample, optionally resize; returns ``(z, noise_level)``."""
        latents = self.model.encode(x).sample() * self.scale_factor
        noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long()
        latents = self.q_sample(latents, noise_level)
        if self.out_size is not None:
            # TODO: experiment with mode
            latents = torch.nn.functional.interpolate(latents, size=self.out_size, mode="nearest")
        # z = z.repeat_interleave(2, -2).repeat_interleave(2, -1)
        return latents, noise_level

    def decode(self, z):
        """Undo the latent scaling and decode with the wrapped model."""
        return self.model.decode(z / self.scale_factor)
535
+
536
+
537
if __name__ == "__main__":
    from ldm.util import count_params
    sentences = ["a hedgehog drinking a whiskey", "der mond ist aufgegangen", "Ein Satz mit vielen Sonderzeichen: äöü ß ?! : 'xx-y/@s'"]

    # Smoke test: build each text embedder on CUDA, report its parameter
    # count and print the shape of the embeddings for the sample sentences.
    for build_model in (lambda: FrozenT5Embedder(version="google/t5-v1_1-xl"), FrozenCLIPEmbedder):
        model = build_model().cuda()
        count_params(model, True)
        z = model(sentences)
        print(z.shape)

    print("done.")
ldm/modules/evaluate/adm_evaluator.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import io
3
+ import os
4
+ import random
5
+ import warnings
6
+ import zipfile
7
+ from abc import ABC, abstractmethod
8
+ from contextlib import contextmanager
9
+ from functools import partial
10
+ from multiprocessing import cpu_count
11
+ from multiprocessing.pool import ThreadPool
12
+ from typing import Iterable, Optional, Tuple
13
+ import yaml
14
+
15
+ import numpy as np
16
+ import requests
17
+ import tensorflow.compat.v1 as tf
18
+ from scipy import linalg
19
+ from tqdm.auto import tqdm
20
+
21
# Download URL and local file name of the Inception-v3 graph definition.
INCEPTION_V3_URL = "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb"
INCEPTION_V3_PATH = "classify_image_graph_def.pb"

# Presumably tensor names inside that graph (pooled and spatial features) -
# verify against the graph-loading code.
FID_POOL_NAME = "pool_3:0"
FID_SPATIAL_NAME = "mixed_6/conv:0"

# Human-readable list of the packages this script needs, one per line.
REQUIREMENTS = "\n".join(
    (
        "This script has the following requirements: ",
        "tensorflow-gpu>=2.0",
        "scipy",
        "requests",
        "tqdm",
    )
)
29
+
30
+
31
def main():
    """
    Command-line entry point: compare a sample batch against a reference batch.

    Reads ``--ref_batch`` and ``--sample_batch`` (paths to npz files), prints
    Inception Score, FID, sFID, precision and recall, and writes them to
    ``evaluation_metrics.yaml`` in the directory of the sample batch.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ref_batch", help="path to reference batch npz file")
    parser.add_argument("--sample_batch", help="path to sample batch npz file")
    args = parser.parse_args()

    config = tf.ConfigProto(
        allow_soft_placement=True  # allows DecodeJpeg to run on CPU in Inception graph
    )
    config.gpu_options.allow_growth = True
    evaluator = Evaluator(tf.Session(config=config))

    print("warming up TensorFlow...")
    # This will cause TF to print a bunch of verbose stuff now rather
    # than after the next print(), to help prevent confusion.
    evaluator.warmup()

    print("computing reference batch activations...")
    ref_acts = evaluator.read_activations(args.ref_batch)
    print("computing/reading reference batch statistics...")
    ref_stats, ref_stats_spatial = evaluator.read_statistics(args.ref_batch, ref_acts)

    print("computing sample batch activations...")
    sample_acts = evaluator.read_activations(args.sample_batch)
    print("computing/reading sample batch statistics...")
    sample_stats, sample_stats_spatial = evaluator.read_statistics(args.sample_batch, sample_acts)

    print("Computing evaluations...")
    is_ = evaluator.compute_inception_score(sample_acts[0])
    print("Inception Score:", is_)
    fid = sample_stats.frechet_distance(ref_stats)
    print("FID:", fid)
    sfid = sample_stats_spatial.frechet_distance(ref_stats_spatial)
    print("sFID:", sfid)
    prec, recall = evaluator.compute_prec_recall(ref_acts[0], sample_acts[0])
    print("Precision:", prec)
    print("Recall:", recall)

    # dirname() also handles files directly under the filesystem root and
    # platform-specific separators, unlike splitting on '/' by hand.
    savepath = os.path.dirname(args.sample_batch)
    results_file = os.path.join(savepath, 'evaluation_metrics.yaml')
    print(f'Saving evaluation results to "{results_file}"')

    # Cast to built-in floats: the Frechet distances are NumPy scalars, which
    # yaml.dump would serialize as opaque python-object tags instead of numbers.
    # The precision key previously carried a stray trailing colon.
    results = {
        'IS': float(is_),
        'FID': float(fid),
        'sFID': float(sfid),
        'Precision': float(prec),
        'Recall': float(recall),
    }

    with open(results_file, 'w') as f:
        yaml.dump(results, f, default_flow_style=False)
83
+
84
class InvalidFIDException(Exception):
    """Error type for an invalid FID computation (judging by the name; no raise site is visible nearby)."""
86
+
87
+
88
class FIDStatistics:
    """Pairs a mean vector (``mu``) with a covariance matrix (``sigma``)."""

    def __init__(self, mu: np.ndarray, sigma: np.ndarray):
        self.mu = mu
        self.sigma = sigma

    def frechet_distance(self, other, eps=1e-6):
        """
        Compute the Frechet distance between two sets of statistics.

        :param other: another object exposing ``mu`` and ``sigma``.
        :param eps: value added to both covariance diagonals when the matrix
                    square root of their product is not finite.
        :return: ``|mu1 - mu2|^2 + Tr(s1) + Tr(s2) - 2 * Tr(sqrt(s1 @ s2))``.
        :raises ValueError: if the square root keeps a non-negligible imaginary part.
        """
        # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L132
        mu1 = np.atleast_1d(self.mu)
        mu2 = np.atleast_1d(other.mu)
        sigma1 = np.atleast_2d(self.sigma)
        sigma2 = np.atleast_2d(other.sigma)

        assert (
            mu1.shape == mu2.shape
        ), f"Training and test mean vectors have different lengths: {mu1.shape}, {mu2.shape}"
        assert (
            sigma1.shape == sigma2.shape
        ), f"Training and test covariances have different dimensions: {sigma1.shape}, {sigma2.shape}"

        mean_gap = mu1 - mu2

        # product might be almost singular
        cov_sqrt, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
        if not np.isfinite(cov_sqrt).all():
            msg = (
                "fid calculation produces singular product; adding %s to diagonal of cov estimates"
                % eps
            )
            warnings.warn(msg)
            jitter = np.eye(sigma1.shape[0]) * eps
            cov_sqrt = linalg.sqrtm((sigma1 + jitter).dot(sigma2 + jitter))

        # numerical error might give slight imaginary component
        if np.iscomplexobj(cov_sqrt):
            diagonal_is_real = np.allclose(np.diagonal(cov_sqrt).imag, 0, atol=1e-3)
            if not diagonal_is_real:
                m = np.max(np.abs(cov_sqrt.imag))
                raise ValueError("Imaginary component {}".format(m))
            cov_sqrt = cov_sqrt.real

        trace_of_sqrt = np.trace(cov_sqrt)

        return mean_gap.dot(mean_gap) + np.trace(sigma1) + np.trace(sigma2) - 2 * trace_of_sqrt
137
+
138
+
139
class Evaluator:
    """Computes Inception activations in a TensorFlow session and derives evaluation metrics from them."""

    def __init__(
        self,
        session,
        batch_size=64,
        softmax_batch_size=512,
    ):
        """
        :param session: the TensorFlow session; placeholders and graphs are added to its graph.
        :param batch_size: number of images per batch when reading an npz file.
        :param softmax_batch_size: number of activation rows per softmax run.
        """
        self.sess = session
        self.batch_size = batch_size
        self.softmax_batch_size = softmax_batch_size
        self.manifold_estimator = ManifoldEstimator(session)
        with self.sess.graph.as_default():
            self.image_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
            self.softmax_input = tf.placeholder(tf.float32, shape=[None, 2048])
            self.pool_features, self.spatial_features = _create_feature_graph(self.image_input)
            self.softmax = _create_softmax_graph(self.softmax_input)

    def warmup(self):
        """Run the feature graph once on a single all-zero batch of eight 64x64 images."""
        self.compute_activations(np.zeros([1, 8, 64, 64, 3]))

    def read_activations(self, npz_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Compute activations for the images stored under ``arr_0`` in ``npz_path``."""
        with open_npz_array(npz_path, "arr_0") as reader:
            return self.compute_activations(reader.read_batches(self.batch_size))

    def compute_activations(self, batches: Iterable[np.ndarray],silent=False) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute image features for downstream evals.

        :param batches: a iterator over NHWC numpy arrays in [0, 255].
        :param silent: if True, do not wrap the iterator in a tqdm progress bar.
        :return: a tuple of numpy arrays of shape [N x X], where X is a feature
                 dimension. The tuple is (pool_3, spatial).
        """
        preds = []
        spatial_preds = []
        it = batches if silent else tqdm(batches)
        for batch in it:
            batch = batch.astype(np.float32)
            pred, spatial_pred = self.sess.run(
                [self.pool_features, self.spatial_features], {self.image_input: batch}
            )
            # Flatten everything but the batch axis.
            preds.append(pred.reshape([pred.shape[0], -1]))
            spatial_preds.append(spatial_pred.reshape([spatial_pred.shape[0], -1]))
        return (
            np.concatenate(preds, axis=0),
            np.concatenate(spatial_preds, axis=0),
        )

    def read_statistics(
        self, npz_path: str, activations: Tuple[np.ndarray, np.ndarray]
    ) -> Tuple[FIDStatistics, FIDStatistics]:
        """
        Return (pooled, spatial) statistics for a batch.

        Uses the ``mu``/``sigma``/``mu_s``/``sigma_s`` arrays stored in the npz
        file when a ``mu`` entry is present; otherwise computes the statistics
        from ``activations``.
        """
        # The archive is closed on exit; arrays read inside the block are
        # already materialized in memory, so they stay valid afterwards.
        with np.load(npz_path) as obj:
            if "mu" in obj.files:
                return FIDStatistics(obj["mu"], obj["sigma"]), FIDStatistics(
                    obj["mu_s"], obj["sigma_s"]
                )
        return tuple(self.compute_statistics(x) for x in activations)

    def compute_statistics(self, activations: np.ndarray) -> FIDStatistics:
        """Mean over rows and covariance over columns of ``activations``."""
        mu = np.mean(activations, axis=0)
        sigma = np.cov(activations, rowvar=False)
        return FIDStatistics(mu, sigma)

    def compute_inception_score(self, activations: np.ndarray, split_size: int = 5000) -> float:
        """
        Compute the Inception Score from pooled activations.

        :param activations: rows fed to the softmax graph in chunks of ``softmax_batch_size``.
        :param split_size: number of predictions per split; the per-split scores are averaged.
        """
        softmax_out = []
        for i in range(0, len(activations), self.softmax_batch_size):
            acts = activations[i : i + self.softmax_batch_size]
            softmax_out.append(self.sess.run(self.softmax, feed_dict={self.softmax_input: acts}))
        preds = np.concatenate(softmax_out, axis=0)
        # https://github.com/openai/improved-gan/blob/4f5d1ec5c16a7eceb206f42bfc652693601e1d5c/inception_score/model.py#L46
        scores = []
        for i in range(0, len(preds), split_size):
            part = preds[i : i + split_size]
            # exp of the mean KL divergence between each prediction and the split's mean prediction
            kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
            kl = np.mean(np.sum(kl, 1))
            scores.append(np.exp(kl))
        return float(np.mean(scores))

    def compute_prec_recall(
        self, activations_ref: np.ndarray, activations_sample: np.ndarray
    ) -> Tuple[float, float]:
        """Return (precision, recall) of the sample activations against the reference activations."""
        radii_1 = self.manifold_estimator.manifold_radii(activations_ref)
        radii_2 = self.manifold_estimator.manifold_radii(activations_sample)
        pr = self.manifold_estimator.evaluate_pr(
            activations_ref, radii_1, activations_sample, radii_2
        )
        # Only the first neighbourhood size is reported.
        return (float(pr[0][0]), float(pr[1][0]))
225
+
226
+
227
class ManifoldEstimator:
    """
    A helper for comparing manifolds of feature vectors.

    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L57
    """

    def __init__(
        self,
        session,
        row_batch_size=10000,
        col_batch_size=10000,
        nhood_sizes=(3,),
        clamp_to_percentile=None,
        eps=1e-5,
    ):
        """
        Estimate the manifold of given feature vectors.

        :param session: the TensorFlow session.
        :param row_batch_size: row batch size to compute pairwise distances
                               (parameter to trade-off between memory usage and performance).
        :param col_batch_size: column batch size to compute pairwise distances.
        :param nhood_sizes: number of neighbors used to estimate the manifold.
        :param clamp_to_percentile: prune hyperspheres that have radius larger than
                                    the given percentile.
        :param eps: small number for numerical stability.
        """
        self.distance_block = DistanceBlock(session)
        self.row_batch_size = row_batch_size
        self.col_batch_size = col_batch_size
        self.nhood_sizes = nhood_sizes
        self.num_nhoods = len(nhood_sizes)
        self.clamp_to_percentile = clamp_to_percentile
        self.eps = eps

    def warmup(self):
        """Run ``evaluate_pr`` once on a single all-zero feature vector."""
        feats, radii = (
            np.zeros([1, 2048], dtype=np.float32),
            np.zeros([1, 1], dtype=np.float32),
        )
        self.evaluate_pr(feats, radii, feats, radii)

    def manifold_radii(self, features: np.ndarray) -> np.ndarray:
        """
        Compute, per feature vector, its distance to the k-th nearest vector.

        :param features: [N x D] feature vectors.
        :return: [N x len(nhood_sizes)] float32 radii; radii above the
                 ``clamp_to_percentile`` percentile are set to 0 when that is configured.
        """
        num_images = len(features)

        # Estimate manifold of features by calculating distances to k-NN of each sample.
        radii = np.zeros([num_images, self.num_nhoods], dtype=np.float32)
        distance_batch = np.zeros([self.row_batch_size, num_images], dtype=np.float32)
        seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)

        for begin1 in range(0, num_images, self.row_batch_size):
            end1 = min(begin1 + self.row_batch_size, num_images)
            row_batch = features[begin1:end1]

            for begin2 in range(0, num_images, self.col_batch_size):
                end2 = min(begin2 + self.col_batch_size, num_images)
                col_batch = features[begin2:end2]

                # Compute distances between batches.
                distance_batch[
                    0 : end1 - begin1, begin2:end2
                ] = self.distance_block.pairwise_distances(row_batch, col_batch)

            # Find the k-nearest neighbor from the current batch.
            radii[begin1:end1, :] = np.concatenate(
                [
                    x[:, self.nhood_sizes]
                    for x in _numpy_partition(distance_batch[0 : end1 - begin1, :], seq, axis=1)
                ],
                axis=0,
            )

        if self.clamp_to_percentile is not None:
            max_distances = np.percentile(radii, self.clamp_to_percentile, axis=0)
            radii[radii > max_distances] = 0
        return radii

    def evaluate(self, features: np.ndarray, radii: np.ndarray, eval_features: np.ndarray):
        """
        Evaluate if new feature vectors are at the manifold.

        :param features: reference feature vectors.
        :param radii: per-reference-vector radii (one column per neighborhood size).
        :param eval_features: feature vectors to test against the reference manifold.
        :return: dict with ``fraction``, ``batch_predictions``,
                 ``max_realisim_score`` and ``nearest_indices``.
        """
        num_eval_images = eval_features.shape[0]
        num_ref_images = radii.shape[0]
        distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32)
        batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32)
        max_realism_score = np.zeros([num_eval_images], dtype=np.float32)
        nearest_indices = np.zeros([num_eval_images], dtype=np.int32)

        for begin1 in range(0, num_eval_images, self.row_batch_size):
            end1 = min(begin1 + self.row_batch_size, num_eval_images)
            feature_batch = eval_features[begin1:end1]

            for begin2 in range(0, num_ref_images, self.col_batch_size):
                end2 = min(begin2 + self.col_batch_size, num_ref_images)
                ref_batch = features[begin2:end2]

                distance_batch[
                    0 : end1 - begin1, begin2:end2
                ] = self.distance_block.pairwise_distances(feature_batch, ref_batch)

            # From the minibatch of new feature vectors, determine if they are in the estimated manifold.
            # If a feature vector is inside a hypersphere of some reference sample, then
            # the new sample lies at the estimated manifold.
            # The radii of the hyperspheres are determined from distances of neighborhood size k.
            samples_in_manifold = distance_batch[0 : end1 - begin1, :, None] <= radii
            batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32)

            max_realism_score[begin1:end1] = np.max(
                radii[:, 0] / (distance_batch[0 : end1 - begin1, :] + self.eps), axis=1
            )
            nearest_indices[begin1:end1] = np.argmin(distance_batch[0 : end1 - begin1, :], axis=1)

        return {
            "fraction": float(np.mean(batch_predictions)),
            "batch_predictions": batch_predictions,
            "max_realisim_score": max_realism_score,
            "nearest_indices": nearest_indices,
        }

    def evaluate_pr(
        self,
        features_1: np.ndarray,
        radii_1: np.ndarray,
        features_2: np.ndarray,
        radii_2: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Evaluate precision and recall efficiently.

        :param features_1: [N1 x D] feature vectors for reference batch.
        :param radii_1: [N1 x K1] radii for reference vectors.
        :param features_2: [N2 x D] feature vectors for the other batch.
        :param radii_2: [N x K2] radii for other vectors.
        :return: a tuple of arrays for (precision, recall):
                 - precision: an np.ndarray of length K1
                 - recall: an np.ndarray of length K2
        """
        # Built-in ``bool`` as dtype: the ``np.bool`` alias no longer exists in
        # current NumPy releases.
        features_1_status = np.zeros([len(features_1), radii_2.shape[1]], dtype=bool)
        features_2_status = np.zeros([len(features_2), radii_1.shape[1]], dtype=bool)
        for begin_1 in range(0, len(features_1), self.row_batch_size):
            # Slicing clips an end index that runs past the array.
            end_1 = begin_1 + self.row_batch_size
            batch_1 = features_1[begin_1:end_1]
            for begin_2 in range(0, len(features_2), self.col_batch_size):
                end_2 = begin_2 + self.col_batch_size
                batch_2 = features_2[begin_2:end_2]
                batch_1_in, batch_2_in = self.distance_block.less_thans(
                    batch_1, radii_1[begin_1:end_1], batch_2, radii_2[begin_2:end_2]
                )
                # A vector counts as covered once any batch pairing covers it.
                features_1_status[begin_1:end_1] |= batch_1_in
                features_2_status[begin_2:end_2] |= batch_2_in
        return (
            np.mean(features_2_status.astype(np.float64), axis=0),
            np.mean(features_1_status.astype(np.float64), axis=0),
        )
382
+
383
+
384
class DistanceBlock:
    """
    Graph-mode helper that evaluates pairwise squared distances between two
    batches of feature vectors, plus the radius comparisons built on top of them.

    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L34
    """

    def __init__(self, session):
        """Build the distance / comparison ops inside ``session.graph``."""
        self.session = session

        with session.graph.as_default():
            self._features_batch1 = tf.placeholder(tf.float32, shape=[None, None])
            self._features_batch2 = tf.placeholder(tf.float32, shape=[None, None])

            # First try the computation in float16 ...
            half_batch1 = tf.cast(self._features_batch1, tf.float16)
            half_batch2 = tf.cast(self._features_batch2, tf.float16)
            distance_block_16 = _batch_pairwise_distances(half_batch1, half_batch2)
            half_is_finite = tf.reduce_all(tf.math.is_finite(distance_block_16))

            def _from_half():
                return tf.cast(distance_block_16, tf.float32)

            def _from_full():
                return _batch_pairwise_distances(
                    self._features_batch1, self._features_batch2
                )

            # ... and only redo it in float32 when the float16 result contains
            # a non-finite entry.
            self.distance_block = tf.cond(half_is_finite, _from_half, _from_full)

            # Extra ops used by less_thans(): one radii placeholder per batch.
            self._radii1 = tf.placeholder(tf.float32, shape=[None, None])
            self._radii2 = tf.placeholder(tf.float32, shape=[None, None])
            dist32 = tf.cast(self.distance_block, tf.float32)[..., None]
            within_radii2 = dist32 <= self._radii2
            self._batch_1_in = tf.math.reduce_any(within_radii2, axis=1)
            within_radii1 = dist32 <= self._radii1[:, None]
            self._batch_2_in = tf.math.reduce_any(within_radii1, axis=0)

    def pairwise_distances(self, U, V):
        """
        Run the distance op for feature batches ``U`` and ``V`` and return the
        resulting matrix.
        """
        feeds = {self._features_batch1: U, self._features_batch2: V}
        return self.session.run(self.distance_block, feed_dict=feeds)

    def less_thans(self, batch_1, radii_1, batch_2, radii_2):
        """
        Evaluate both radius-membership masks in one session call.

        Returns the pair ``[batch_1_in, batch_2_in]`` as produced by the
        ``reduce_any`` ops built in ``__init__``.
        """
        feeds = {
            self._features_batch1: batch_1,
            self._features_batch2: batch_2,
            self._radii1: radii_1,
            self._radii2: radii_2,
        }
        fetches = [self._batch_1_in, self._batch_2_in]
        return self.session.run(fetches, feed_dict=feeds)
434
+
435
+
436
def _batch_pairwise_distances(U, V):
    """
    Build the op for pairwise squared Euclidean distances between the rows of
    ``U`` and the rows of ``V``, clamped from below at zero.
    """
    with tf.variable_scope("pairwise_dist_block"):
        # Row-wise squared norms, shaped so they broadcast against U @ V^T:
        # a column for U, a row for V.
        sq_norm_u = tf.reshape(tf.reduce_sum(tf.square(U), 1), [-1, 1])
        sq_norm_v = tf.reshape(tf.reduce_sum(tf.square(V), 1), [1, -1])

        # ||u - v||^2 = ||u||^2 - 2 u.v + ||v||^2
        cross_term = tf.matmul(U, V, transpose_a=False, transpose_b=True)
        distances = tf.maximum(sq_norm_u - 2 * cross_term + sq_norm_v, 0.0)

    return distances
453
+
454
+
455
class NpzArrayReader(ABC):
    """Abstract sequential reader that hands out an array in batches."""

    @abstractmethod
    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
        """Return the next batch, or ``None`` once nothing is left."""

    @abstractmethod
    def remaining(self) -> int:
        """Return how many leading-axis entries have not been read yet."""

    def read_batches(self, batch_size: int) -> Iterable[np.ndarray]:
        """
        Wrap repeated ``read_batch`` calls in a sized iterable.

        The reported length is the number of batches needed to consume
        ``remaining()`` entries, rounding a partial final batch up.
        """

        def gen_fn():
            batch = self.read_batch(batch_size)
            while batch is not None:
                yield batch
                batch = self.read_batch(batch_size)

        full_batches, leftover = divmod(self.remaining(), batch_size)
        return BatchIterator(gen_fn, full_batches + (1 if leftover != 0 else 0))
475
+
476
+
477
class BatchIterator:
    """Re-iterable wrapper pairing a generator factory with a known length."""

    def __init__(self, gen_fn, length):
        """
        :param gen_fn: zero-argument callable returning a fresh iterator.
        :param length: value reported by ``len()``.
        """
        self.length = length
        self.gen_fn = gen_fn

    def __len__(self):
        return self.length

    def __iter__(self):
        # Every iteration starts from a new iterator produced by the factory.
        make_iterator = self.gen_fn
        return make_iterator()
487
+
488
+
489
class StreamingNpzArrayReader(NpzArrayReader):
    """Reader that pulls raw array bytes from an open file object on demand."""

    def __init__(self, arr_f, shape, dtype):
        """
        :param arr_f: file-like object positioned at the start of the array data.
        :param shape: full array shape; the first axis is the one batched over.
        :param dtype: numpy dtype of the stored elements.
        """
        self.arr_f = arr_f
        self.shape = shape
        self.dtype = dtype
        self.idx = 0

    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
        """Read up to ``batch_size`` entries; return ``None`` when exhausted."""
        total = self.shape[0]
        if self.idx >= total:
            return None

        bs = min(batch_size, total - self.idx)
        self.idx += bs
        batch_shape = [bs, *self.shape[1:]]

        # Zero-width dtypes carry no bytes, so there is nothing to read.
        if self.dtype.itemsize == 0:
            return np.ndarray(batch_shape, dtype=self.dtype)

        element_count = bs * np.prod(self.shape[1:])
        byte_count = int(element_count * self.dtype.itemsize)
        raw = _read_bytes(self.arr_f, byte_count, "array data")
        return np.frombuffer(raw, dtype=self.dtype).reshape(batch_shape)

    def remaining(self) -> int:
        """Number of leading-axis entries not yet consumed."""
        return max(0, self.shape[0] - self.idx)
513
+
514
+
515
class MemoryNpzArrayReader(NpzArrayReader):
    """Reader that slices batches out of an array already held in memory."""

    def __init__(self, arr):
        self.arr = arr
        self.idx = 0

    @classmethod
    def load(cls, path: str, arr_name: str):
        """Load ``arr_name`` from the archive at ``path`` and wrap it in a reader."""
        with open(path, "rb") as npz_f:
            loaded = np.load(npz_f)[arr_name]
        return cls(loaded)

    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
        """Return the next slice of at most ``batch_size`` rows, or ``None``."""
        start = self.idx
        if start >= self.arr.shape[0]:
            return None

        stop = start + batch_size
        # The cursor may step past the end; remaining() clamps at zero.
        self.idx = stop
        return self.arr[start:stop]

    def remaining(self) -> int:
        """Number of leading-axis entries not yet consumed."""
        return max(0, self.arr.shape[0] - self.idx)
536
+
537
+
538
@contextmanager
def open_npz_array(path: str, arr_name: str) -> NpzArrayReader:
    """
    Context manager yielding a reader for array ``arr_name`` inside ``path``.

    A streaming reader is used when the member has a version 1.0 or 2.0 header
    and is neither Fortran-ordered nor an object array; in every other case the
    array is loaded fully into memory.
    """
    header_parsers = {
        (1, 0): np.lib.format.read_array_header_1_0,
        (2, 0): np.lib.format.read_array_header_2_0,
    }
    with _open_npy_file(path, arr_name) as arr_f:
        version = np.lib.format.read_magic(arr_f)
        parse_header = header_parsers.get(version)
        if parse_header is None:
            # Unrecognised format version: let numpy load the whole array.
            yield MemoryNpzArrayReader.load(path, arr_name)
            return

        shape, fortran, dtype = parse_header(arr_f)
        can_stream = not (fortran or dtype.hasobject)
        if can_stream:
            yield StreamingNpzArrayReader(arr_f, shape, dtype)
        else:
            yield MemoryNpzArrayReader.load(path, arr_name)
554
+
555
+
556
+ def _read_bytes(fp, size, error_template="ran out of data"):
557
+ """
558
+ Copied from: https://github.com/numpy/numpy/blob/fb215c76967739268de71aa4bda55dd1b062bc2e/numpy/lib/format.py#L788-L886
559
+
560
+ Read from file-like object until size bytes are read.
561
+ Raises ValueError if not EOF is encountered before size bytes are read.
562
+ Non-blocking objects only supported if they derive from io objects.
563
+ Required as e.g. ZipExtFile in python 2.6 can return less data than
564
+ requested.
565
+ """
566
+ data = bytes()
567
+ while True:
568
+ # io files (default in python3) return None or raise on
569
+ # would-block, python2 file will truncate, probably nothing can be
570
+ # done about that. note that regular files can't be non-blocking
571
+ try:
572
+ r = fp.read(size - len(data))
573
+ data += r
574
+ if len(r) == 0 or len(data) == size:
575
+ break
576
+ except io.BlockingIOError:
577
+ pass
578
+ if len(data) != size:
579
+ msg = "EOF: reading %s, expected %d bytes got %d"
580
+ raise ValueError(msg % (error_template, size, len(data)))
581
+ else:
582
+ return data
583
+
584
+
585
+ @contextmanager
586
+ def _open_npy_file(path: str, arr_name: str):
587
+ with open(path, "rb") as f:
588
+ with zipfile.ZipFile(f, "r") as zip_f:
589
+ if f"{arr_name}.npy" not in zip_f.namelist():
590
+ raise ValueError(f"missing {arr_name} in npz file")
591
+ with zip_f.open(f"{arr_name}.npy", "r") as arr_f:
592
+ yield arr_f
593
+
594
+
595
def _download_inception_model():
    """
    Fetch the InceptionV3 graph to ``INCEPTION_V3_PATH`` unless that path
    already exists.

    The payload is streamed into a ``.tmp`` sibling and renamed into place
    once the download loop has finished.
    """
    if not os.path.exists(INCEPTION_V3_PATH):
        print("downloading InceptionV3 model...")
        with requests.get(INCEPTION_V3_URL, stream=True) as response:
            response.raise_for_status()
            partial_path = INCEPTION_V3_PATH + ".tmp"
            with open(partial_path, "wb") as out_f:
                for chunk in tqdm(response.iter_content(chunk_size=8192)):
                    out_f.write(chunk)
            os.rename(partial_path, INCEPTION_V3_PATH)
606
+
607
+
608
def _create_feature_graph(input_batch):
    """
    Import the Inception graph with ``input_batch`` wired to ``ExpandDims:0``.

    :return: ``(pool3, spatial)`` - the tensors named by ``FID_POOL_NAME`` and
             ``FID_SPATIAL_NAME``, with ``spatial`` cut to its first 7 entries
             along the last axis.
    """
    _download_inception_model()
    # Random name scope so the graph can be imported more than once.
    scope = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
    with open(INCEPTION_V3_PATH, "rb") as model_f:
        serialized = model_f.read()
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(serialized)

    wanted = [FID_POOL_NAME, FID_SPATIAL_NAME]
    pool3, spatial = tf.import_graph_def(
        graph_def,
        input_map={"ExpandDims:0": input_batch},
        return_elements=wanted,
        name=scope,
    )
    _update_shapes(pool3)
    return pool3, spatial[..., :7]
623
+
624
+
625
def _create_softmax_graph(input_batch):
    """
    Build a softmax head on ``input_batch`` that reuses the weight input of the
    imported graph's ``softmax/logits/MatMul`` op.
    """
    _download_inception_model()
    # Random name scope so the graph can be imported more than once.
    scope = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
    with open(INCEPTION_V3_PATH, "rb") as model_f:
        serialized = model_f.read()
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(serialized)

    imported = tf.import_graph_def(
        graph_def, return_elements=["softmax/logits/MatMul"], name=scope
    )
    (matmul,) = imported
    # Second input of the MatMul op is its weight operand.
    weights = matmul.inputs[1]
    return tf.nn.softmax(tf.matmul(input_batch, weights))
637
+
638
+
639
def _update_shapes(pool3):
    """
    Relax a leading dimension of 1 to ``None`` on every op output in
    ``pool3``'s graph, then return ``pool3``.

    See https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L50-L63
    """
    for op in pool3.graph.get_operations():
        for tensor in op.outputs:
            shape = tensor.get_shape()
            if shape._dims is None:  # pylint: disable=protected-access
                continue
            # TF 1.x needed ``[s.value for s in shape]``; TF 2.x iterates directly.
            dims = list(shape)
            relaxed = [
                None if (dim == 1 and axis == 0) else dim
                for axis, dim in enumerate(dims)
            ]
            # Overwrite the cached static shape stored on the tensor object.
            tensor.__dict__["_shape_val"] = tf.TensorShape(relaxed)
    return pool3
656
+
657
+
658
+ def _numpy_partition(arr, kth, **kwargs):
659
+ num_workers = min(cpu_count(), len(arr))
660
+ chunk_size = len(arr) // num_workers
661
+ extra = len(arr) % num_workers
662
+
663
+ start_idx = 0
664
+ batches = []
665
+ for i in range(num_workers):
666
+ size = chunk_size + (1 if i < extra else 0)
667
+ batches.append(arr[start_idx : start_idx + size])
668
+ start_idx += size
669
+
670
+ with ThreadPool(num_workers) as pool:
671
+ return list(pool.map(partial(np.partition, kth=kth, **kwargs), batches))
672
+
673
+
674
if __name__ == "__main__":
    # Script entry point: echo the module-level REQUIREMENTS value, then hand
    # control to main().
    print(REQUIREMENTS)
    main()
ldm/modules/evaluate/evaluate_perceptualsim.py ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from tqdm import tqdm
5
+ from collections import namedtuple
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torchvision.transforms as transforms
10
+ from torchvision import models
11
+ from PIL import Image
12
+
13
+ from ldm.modules.evaluate.ssim import ssim
14
+
15
+
16
+ transform = transforms.Compose([transforms.ToTensor()])
17
+
18
def normalize_tensor(in_feat, eps=1e-10):
    """
    Divide ``in_feat`` by its L2 norm taken over dim 1.

    :param in_feat: 4-D tensor; dim 1 is the axis being normalised.
    :param eps: added to the norm so the division never hits exactly zero.
    """
    dims = in_feat.size()
    squared_sum = torch.sum(in_feat ** 2, dim=1)
    l2_norm = torch.sqrt(squared_sum).view(dims[0], 1, dims[2], dims[3])
    denominator = l2_norm.expand_as(in_feat) + eps
    return in_feat / denominator
23
+
24
+
25
def cos_sim(in0, in1):
    """
    Mean cosine similarity between two 4-D feature maps.

    Both inputs are normalised over dim 1, multiplied element-wise and summed
    over dim 1; the result is averaged over dim 2 and then dim 3, giving one
    value per leading-axis entry.
    """
    unit0 = normalize_tensor(in0)
    unit1 = normalize_tensor(in1)
    n, x_len, y_len = in0.size()[0], in0.size()[2], in0.size()[3]

    per_position = torch.sum(unit0 * unit1, dim=1).view(n, 1, x_len, y_len)
    # Average in two steps (dim 2, then dim 3).
    mean_over_x = torch.mean(per_position, dim=2).view(n, 1, 1, y_len)
    mean_over_xy = torch.mean(mean_over_x, dim=3)
    return mean_over_xy.view(n)
38
+
39
+
40
class squeezenet(torch.nn.Module):
    """``squeezenet1_1`` feature stack cut into seven consecutive slices."""

    def __init__(self, requires_grad=False, pretrained=True):
        """
        :param requires_grad: when False, every parameter is frozen.
        :param pretrained: forwarded to ``models.squeezenet1_1``.
        """
        super(squeezenet, self).__init__()
        pretrained_features = models.squeezenet1_1(pretrained=pretrained).features

        # (start, stop) layer indices feeding slice1 .. slice7.
        boundaries = [(0, 2), (2, 5), (5, 8), (8, 10), (10, 11), (11, 12), (12, 13)]
        for slice_no, (start, stop) in enumerate(boundaries, start=1):
            block = torch.nn.Sequential()
            for layer_idx in range(start, stop):
                block.add_module(str(layer_idx), pretrained_features[layer_idx])
            setattr(self, "slice%d" % slice_no, block)
        self.N_slices = 7

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad_(False)

    def forward(self, X):
        """Return a namedtuple with the output of each of the seven slices."""
        stages = (
            self.slice1,
            self.slice2,
            self.slice3,
            self.slice4,
            self.slice5,
            self.slice6,
            self.slice7,
        )
        activations = []
        h = X
        for stage in stages:
            h = stage(h)
            activations.append(h)

        vgg_outputs = namedtuple(
            "SqueezeOutputs",
            ["relu1", "relu2", "relu3", "relu4", "relu5", "relu6", "relu7"],
        )
        return vgg_outputs(*activations)
96
+
97
+
98
class alexnet(torch.nn.Module):
    """AlexNet feature stack cut into five consecutive slices."""

    def __init__(self, requires_grad=False, pretrained=True):
        """
        :param requires_grad: when False, every parameter is frozen.
        :param pretrained: forwarded to ``models.alexnet``.
        """
        super(alexnet, self).__init__()
        alexnet_pretrained_features = models.alexnet(pretrained=pretrained).features

        # (start, stop) layer indices feeding slice1 .. slice5.
        boundaries = [(0, 2), (2, 5), (5, 8), (8, 10), (10, 12)]
        for slice_no, (start, stop) in enumerate(boundaries, start=1):
            block = torch.nn.Sequential()
            for layer_idx in range(start, stop):
                block.add_module(str(layer_idx), alexnet_pretrained_features[layer_idx])
            setattr(self, "slice%d" % slice_no, block)
        self.N_slices = 5

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad_(False)

    def forward(self, X):
        """Return a namedtuple with the output of each of the five slices."""
        stages = (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5)
        activations = []
        h = X
        for stage in stages:
            h = stage(h)
            activations.append(h)

        alexnet_outputs = namedtuple(
            "AlexnetOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5"]
        )
        return alexnet_outputs(*activations)
141
+
142
+
143
class vgg16(torch.nn.Module):
    """VGG-16 feature stack cut into five consecutive slices."""

    def __init__(self, requires_grad=False, pretrained=True):
        """
        :param requires_grad: when False, every parameter is frozen.
        :param pretrained: forwarded to ``models.vgg16``.
        """
        super(vgg16, self).__init__()
        vgg_pretrained_features = models.vgg16(pretrained=pretrained).features

        # (start, stop) layer indices feeding slice1 .. slice5.
        boundaries = [(0, 4), (4, 9), (9, 16), (16, 23), (23, 30)]
        for slice_no, (start, stop) in enumerate(boundaries, start=1):
            block = torch.nn.Sequential()
            for layer_idx in range(start, stop):
                block.add_module(str(layer_idx), vgg_pretrained_features[layer_idx])
            setattr(self, "slice%d" % slice_no, block)
        self.N_slices = 5

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad_(False)

    def forward(self, X):
        """Return a namedtuple with the output of each of the five slices."""
        stages = (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5)
        activations = []
        h = X
        for stage in stages:
            h = stage(h)
            activations.append(h)

        vgg_outputs = namedtuple(
            "VggOutputs",
            ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"],
        )
        return vgg_outputs(*activations)
185
+
186
+
187
class resnet(torch.nn.Module):
    """ResNet backbone exposing five intermediate activations."""

    def __init__(self, requires_grad=False, pretrained=True, num=18):
        """
        :param requires_grad: when False, every parameter is frozen. This
            matches the other backbone wrappers in this module; the flag used
            to be accepted but ignored here.
        :param pretrained: forwarded to the torchvision constructor.
        :param num: ResNet depth; one of 18, 34, 50, 101, 152.
        :raises ValueError: if ``num`` is not a supported depth.
        """
        super(resnet, self).__init__()
        constructors = {
            18: models.resnet18,
            34: models.resnet34,
            50: models.resnet50,
            101: models.resnet101,
            152: models.resnet152,
        }
        if num not in constructors:
            # Previously an unknown depth left ``self.net`` unset and failed
            # a few lines later with an unrelated AttributeError.
            raise ValueError(
                "unsupported resnet depth %r; expected one of %s"
                % (num, sorted(constructors))
            )
        self.net = constructors[num](pretrained=pretrained)
        self.N_slices = 5

        self.conv1 = self.net.conv1
        self.bn1 = self.net.bn1
        self.relu = self.net.relu
        self.maxpool = self.net.maxpool
        self.layer1 = self.net.layer1
        self.layer2 = self.net.layer2
        self.layer3 = self.net.layer3
        self.layer4 = self.net.layer4

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        """
        Return a namedtuple ``(relu1, conv2, conv3, conv4, conv5)``: the
        activation after the stem ReLU followed by the outputs of
        ``layer1`` .. ``layer4``.
        """
        h = self.conv1(X)
        h = self.bn1(h)
        h = self.relu(h)
        h_relu1 = h
        h = self.maxpool(h)
        h = self.layer1(h)
        h_conv2 = h
        h = self.layer2(h)
        h_conv3 = h
        h = self.layer3(h)
        h_conv4 = h
        h = self.layer4(h)
        h_conv5 = h

        outputs = namedtuple(
            "Outputs", ["relu1", "conv2", "conv3", "conv4", "conv5"]
        )
        out = outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)

        return out
232
+
233
+ # Off-the-shelf deep network
234
class PNet(torch.nn.Module):
    """Pre-trained network with all channels equally weighted by default"""

    def __init__(self, pnet_type="vgg", pnet_rand=False, use_gpu=True):
        """
        :param pnet_type: backbone selector: "vgg"/"vgg16", "alex", "squeeze",
            or "resnet<depth>" (e.g. "resnet18", "resnet101").
        :param pnet_rand: when True the backbone is built without pretrained
            weights.
        :param use_gpu: when True the backbone and the normalisation constants
            are moved to CUDA.
        :raises ValueError: if ``pnet_type`` is not recognised.
        """
        super(PNet, self).__init__()

        self.use_gpu = use_gpu

        self.pnet_type = pnet_type
        self.pnet_rand = pnet_rand

        # Per-channel shift/scale applied to the inputs in forward().
        self.shift = torch.Tensor([-0.030, -0.088, -0.188]).view(1, 3, 1, 1)
        self.scale = torch.Tensor([0.458, 0.448, 0.450]).view(1, 3, 1, 1)

        pretrained = not self.pnet_rand
        if self.pnet_type in ["vgg", "vgg16"]:
            self.net = vgg16(pretrained=pretrained, requires_grad=False)
        elif self.pnet_type == "alex":
            self.net = alexnet(pretrained=pretrained, requires_grad=False)
        elif self.pnet_type.startswith("resnet"):
            # Parse everything after the prefix so three-digit depths work;
            # the old ``pnet_type[:-2] == "resnet"`` test could never match
            # "resnet101" or "resnet152".
            self.net = resnet(
                pretrained=pretrained,
                requires_grad=False,
                num=int(self.pnet_type[len("resnet"):]),
            )
        elif self.pnet_type == "squeeze":
            self.net = squeezenet(pretrained=pretrained, requires_grad=False)
        else:
            raise ValueError("unknown pnet_type: %r" % (self.pnet_type,))

        self.L = self.net.N_slices

        if use_gpu:
            self.net.cuda()
            self.shift = self.shift.cuda()
            self.scale = self.scale.cuda()

    def forward(self, in0, in1, retPerLayer=False):
        """
        Sum ``1 - cos_sim`` over all backbone outputs for the two inputs.

        :param in0: first image batch.
        :param in1: second image batch.
        :param retPerLayer: when True, also return the list of per-layer scores.
        :return: the summed score, or ``(summed score, per-layer scores)``.
        """
        in0_sc = (in0 - self.shift.expand_as(in0)) / self.scale.expand_as(in0)
        # Normalise in1 against its own shape rather than in0's.
        in1_sc = (in1 - self.shift.expand_as(in1)) / self.scale.expand_as(in1)

        outs0 = self.net.forward(in0_sc)
        outs1 = self.net.forward(in1_sc)

        all_scores = []
        val = None
        for feat0, feat1 in zip(outs0, outs1):
            cur_score = 1.0 - cos_sim(feat0, feat1)
            val = 1.0 * cur_score if val is None else val + cur_score
            all_scores.append(cur_score)

        if retPerLayer:
            return (val, all_scores)
        return val
294
+
295
+
296
+
297
+
298
+ # The SSIM metric
299
def ssim_metric(img1, img2, mask=None):
    """Thin wrapper: call ``ssim`` on the pair with ``size_average`` turned off."""
    score = ssim(img1, img2, mask=mask, size_average=False)
    return score
301
+
302
+
303
+ # The PSNR metric
304
def psnr(img1, img2, mask=None, reshape=False):
    """
    Per-sample PSNR, computed as ``10 * log10(1 / mse)``.

    :param img1: first image batch.
    :param img2: second image batch.
    :param mask: optional weighting mask; when given, the squared error is
        multiplied by it and normalised by ``3 * sum(mask)`` (clamped to >= 1).
    :param reshape: flatten with ``reshape`` instead of ``view``.
    :return: tensor with one PSNR value per leading-axis entry.
    """
    batch = img1.size(0)

    def flatten(t):
        return t.reshape(batch, -1) if reshape else t.view(batch, -1)

    squared_error = (img1 - img2).pow(2)
    if mask is None:
        mse_err = flatten(squared_error).mean(dim=1)
    else:
        weighted = squared_error * mask
        normaliser = 3 * flatten(mask).sum(dim=1).clamp(min=1)
        mse_err = flatten(weighted).sum(dim=1) / normaliser

    return 10 * (1 / mse_err).log10()
325
+
326
+
327
+ # The perceptual similarity metric
328
def perceptual_sim(img1, img2, vgg16):
    """
    Apply ``x * 2 - 1`` to both images and return whatever the callable
    ``vgg16`` computes for the rescaled pair.
    """
    rescaled_1 = img1 * 2 - 1
    rescaled_2 = img2 * 2 - 1
    return vgg16(rescaled_1, rescaled_2)
333
+
334
def load_img(img_name, size=None):
    """
    Load an image file as a ``1 x 3 x H x W`` CUDA tensor.

    :param img_name: path of the image to open.
    :param size: optional target size; an int gives a square resize, any other
        non-None value is read as ``(height, width)``.
    :return: the image after the module-level ``transform``, moved to CUDA,
        with a leading batch axis added.
    :raises Exception: any loading error is reported on stdout and re-raised.
    """
    try:
        # Close the file handle deterministically; convert() returns an
        # in-memory copy, so the image stays usable after the ``with`` block.
        with Image.open(img_name) as raw:
            # Force three channels so grayscale / palette / RGBA files work
            # with the 3-channel normalisation applied downstream.
            img = raw.convert("RGB")

        if isinstance(size, int):
            img = img.resize((size, size))
        elif size is not None:
            # PIL expects (width, height).
            img = img.resize((size[1], size[0]))

        img = transform(img).cuda()
        img = img.unsqueeze(0)
    except Exception as e:
        print("Failed at loading %s " % img_name)
        print(e)
        # No placeholder tensor is built here: the error is re-raised anyway,
        # and allocating on CUDA inside the handler could mask it.
        raise
    return img
351
+
352
+
353
def compute_perceptual_similarity(folder, pred_img, tgt_img, take_every_other):
    """
    Compute perceptual-similarity / PSNR / SSIM statistics over a directory.

    Every entry of ``folder`` is treated as one sample directory in which
    ``tgt_img`` must glob to exactly one file and ``pred_img`` may glob to any
    number of predictions. The best score over the predictions is kept per
    sample (lowest perceptual distance, highest SSIM and PSNR).

    :param folder: directory whose entries are the sample directories.
    :param pred_img: glob pattern (relative to a sample directory) for predictions.
    :param tgt_img: glob pattern (relative to a sample directory) for the target.
    :param take_every_other: when True, consecutive samples are merged pairwise,
        again keeping the better value of each pair.
    :return: dict mapping metric name to ``(mean, std)``.
    """
    # Load VGG16 for feature similarity
    vgg16 = PNet().to("cuda")
    vgg16.eval()

    values_percsim = []
    values_ssim = []
    values_psnr = []
    for f in tqdm(sorted(os.listdir(folder))):
        # os.path.join keeps this working whether or not ``folder`` ends with
        # a path separator.
        sample_dir = os.path.join(folder, f)
        pred_imgs = glob.glob(os.path.join(sample_dir, pred_img))
        tgt_imgs = glob.glob(os.path.join(sample_dir, tgt_img))
        assert len(tgt_imgs) == 1

        # Sentinels; they are what gets recorded if no prediction is found.
        perc_sim = 10000
        ssim_sim = -10
        psnr_sim = -10

        # The target is the same for every prediction: load it once.
        t_img = load_img(tgt_imgs[0]) if pred_imgs else None
        for pred_path in pred_imgs:
            p_img = load_img(pred_path, size=t_img.shape[2:])
            t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
            perc_sim = min(perc_sim, t_perc_sim)

            ssim_sim = max(ssim_sim, ssim_metric(p_img, t_img).item())
            psnr_sim = max(psnr_sim, psnr(p_img, t_img).item())

        values_percsim += [perc_sim]
        values_ssim += [ssim_sim]
        values_psnr += [psnr_sim]

    if take_every_other:
        n_valuespercsim = []
        n_valuesssim = []
        n_valuespsnr = []
        for i in range(0, len(values_percsim) // 2):
            n_valuespercsim += [
                min(values_percsim[2 * i], values_percsim[2 * i + 1])
            ]
            n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
            n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]

        values_percsim = n_valuespercsim
        values_ssim = n_valuesssim
        values_psnr = n_valuespsnr

    avg_percsim = np.mean(np.array(values_percsim))
    std_percsim = np.std(np.array(values_percsim))

    avg_psnr = np.mean(np.array(values_psnr))
    std_psnr = np.std(np.array(values_psnr))

    avg_ssim = np.mean(np.array(values_ssim))
    std_ssim = np.std(np.array(values_ssim))

    return {
        "Perceptual similarity": (avg_percsim, std_percsim),
        "PSNR": (avg_psnr, std_psnr),
        "SSIM": (avg_ssim, std_ssim),
    }
414
+
415
+
416
def compute_perceptual_similarity_from_list(pred_imgs_list, tgt_imgs_list,
                                            take_every_other,
                                            simple_format=True):
    """
    Compute perceptual-similarity / PSNR / SSIM statistics from path lists.

    :param pred_imgs_list: per-target predictions; entry ``i`` is either one
        image path or a list of image paths for ``tgt_imgs_list[i]``.
    :param tgt_imgs_list: target image paths.
    :param take_every_other: when True, consecutive samples are merged pairwise,
        keeping the better value of each pair.
    :param simple_format: when True, return plain ``[float, float]`` lists
        (readable in YAML); otherwise ``(mean, std)`` tuples of numpy scalars.
    :return: dict mapping metric name to mean and standard deviation.
    """
    # Load VGG16 for feature similarity
    vgg16 = PNet().to("cuda")
    vgg16.eval()

    values_percsim = []
    values_ssim = []
    values_psnr = []
    equal_count = 0
    ambig_count = 0
    for i, tgt_img in enumerate(tqdm(tgt_imgs_list)):
        pred_imgs = pred_imgs_list[i]

        if not isinstance(pred_imgs, list):
            pred_imgs = [pred_imgs]

        # Best-of-N trackers, seeded with sentinels.
        perc_sim = 10000
        ssim_sim = -10
        psnr_sim = -10
        assert len(pred_imgs)>0

        # The target is the same for every prediction: load it once.
        t_img = load_img(tgt_img)
        for pred_path in pred_imgs:
            p_img = load_img(pred_path, size=t_img.shape[2:])
            t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
            perc_sim = min(perc_sim, t_perc_sim)

            ssim_sim = max(ssim_sim, ssim_metric(p_img, t_img).item())
            psnr_sim = max(psnr_sim, psnr(p_img, t_img).item())

        values_percsim += [perc_sim]
        values_ssim += [ssim_sim]
        # ``np.float`` was removed in NumPy 1.24; the builtin float is what it
        # aliased, so the comparison is unchanged.
        if psnr_sim != float("inf"):
            values_psnr += [psnr_sim]
        else:
            # Infinite PSNR means zero error; it is left out of the PSNR
            # statistics. ``p_img`` is the last prediction that was loaded.
            if torch.allclose(p_img, t_img):
                equal_count += 1
                print("{} equal src and wrp images.".format(equal_count))
            else:
                ambig_count += 1
                print("{} ambiguous src and wrp images.".format(ambig_count))

    if take_every_other:
        # NOTE(review): values_psnr can be shorter than values_percsim when
        # infinite PSNR values were skipped above, so the pairing below may
        # misalign or raise IndexError - confirm the intended handling.
        n_valuespercsim = []
        n_valuesssim = []
        n_valuespsnr = []
        for i in range(0, len(values_percsim) // 2):
            n_valuespercsim += [
                min(values_percsim[2 * i], values_percsim[2 * i + 1])
            ]
            n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
            n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]

        values_percsim = n_valuespercsim
        values_ssim = n_valuesssim
        values_psnr = n_valuespsnr

    avg_percsim = np.mean(np.array(values_percsim))
    std_percsim = np.std(np.array(values_percsim))

    avg_psnr = np.mean(np.array(values_psnr))
    std_psnr = np.std(np.array(values_psnr))

    avg_ssim = np.mean(np.array(values_ssim))
    std_ssim = np.std(np.array(values_ssim))

    if simple_format:
        # just to make yaml formatting readable
        return {
            "Perceptual similarity": [float(avg_percsim), float(std_percsim)],
            "PSNR": [float(avg_psnr), float(std_psnr)],
            "SSIM": [float(avg_ssim), float(std_ssim)],
        }
    else:
        return {
            "Perceptual similarity": (avg_percsim, std_percsim),
            "PSNR": (avg_psnr, std_psnr),
            "SSIM": (avg_ssim, std_ssim),
        }
500
+
501
+
502
def compute_perceptual_similarity_from_list_topk(pred_imgs_list, tgt_imgs_list,
                                                 take_every_other, resize=False):
    """
    Like ``compute_perceptual_similarity_from_list`` but also return every
    individual score, so a top-k selection can be made afterwards.

    :param pred_imgs_list: per-target lists of prediction image paths; entry
        ``i`` must be a list belonging to ``tgt_imgs_list[i]``.
    :param tgt_imgs_list: target image paths.
    :param take_every_other: not supported here; must be falsy.
    :param resize: when True, targets are loaded at 256x256 (predictions always
        follow the target's size).
    :return: dict with ``"avg_of_best"`` (mean/std of the best score per
        target) and ``"individual"`` (arrays built from the per-target score
        lists).
    :raises AssertionError: if an entry of ``pred_imgs_list`` is not a list or
        if ``take_every_other`` is truthy.
    """
    # Load VGG16 for feature similarity
    vgg16 = PNet().to("cuda")
    vgg16.eval()

    values_percsim = []
    values_ssim = []
    values_psnr = []
    individual_percsim = []
    individual_ssim = []
    individual_psnr = []
    for i, tgt_img in enumerate(tqdm(tgt_imgs_list)):
        pred_imgs = pred_imgs_list[i]

        # Raised explicitly (same exception type as the old ``assert False``)
        # so the guard is not stripped when Python runs with -O.
        if not isinstance(pred_imgs, list):
            raise AssertionError("every entry of pred_imgs_list must be a list")

        # Best-of-N trackers, seeded with sentinels.
        perc_sim = 10000
        ssim_sim = -10
        psnr_sim = -10
        sample_percsim = list()
        sample_ssim = list()
        sample_psnr = list()

        # The target is the same for every prediction: load it once.
        t_img = None
        if pred_imgs:
            if resize:
                t_img = load_img(tgt_img, size=(256,256))
            else:
                t_img = load_img(tgt_img)

        for pred_path in pred_imgs:
            p_img = load_img(pred_path, size=t_img.shape[2:])

            t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
            sample_percsim.append(t_perc_sim)
            perc_sim = min(perc_sim, t_perc_sim)

            t_ssim = ssim_metric(p_img, t_img).item()
            sample_ssim.append(t_ssim)
            ssim_sim = max(ssim_sim, t_ssim)

            t_psnr = psnr(p_img, t_img).item()
            sample_psnr.append(t_psnr)
            psnr_sim = max(psnr_sim, t_psnr)

        values_percsim += [perc_sim]
        values_ssim += [ssim_sim]
        values_psnr += [psnr_sim]
        individual_percsim.append(sample_percsim)
        individual_ssim.append(sample_ssim)
        individual_psnr.append(sample_psnr)

    if take_every_other:
        # The pairwise merge that used to follow this guard was unreachable
        # and has been dropped.
        raise AssertionError(
            "Do this later, after specifying topk to get proper results"
        )

    avg_percsim = np.mean(np.array(values_percsim))
    std_percsim = np.std(np.array(values_percsim))

    avg_psnr = np.mean(np.array(values_psnr))
    std_psnr = np.std(np.array(values_psnr))

    avg_ssim = np.mean(np.array(values_ssim))
    std_ssim = np.std(np.array(values_ssim))

    individual_percsim = np.array(individual_percsim)
    individual_psnr = np.array(individual_psnr)
    individual_ssim = np.array(individual_ssim)

    return {
        "avg_of_best": {
            "Perceptual similarity": [float(avg_percsim), float(std_percsim)],
            "PSNR": [float(avg_psnr), float(std_psnr)],
            "SSIM": [float(avg_ssim), float(std_ssim)],
        },
        "individual": {
            "PSIM": individual_percsim,
            "PSNR": individual_psnr,
            "SSIM": individual_ssim,
        }
    }
598
+
599
+
600
if __name__ == "__main__":
    # Command-line entry point: score one folder and write the summary both to
    # stdout and to --output_file.
    args = argparse.ArgumentParser()
    args.add_argument("--folder", type=str, default="")
    args.add_argument("--pred_image", type=str, default="")
    args.add_argument("--target_image", type=str, default="")
    args.add_argument("--take_every_other", action="store_true", default=False)
    args.add_argument("--output_file", type=str, default="")

    opts = args.parse_args()

    folder = opts.folder
    pred_img = opts.pred_image
    tgt_img = opts.target_image

    results = compute_perceptual_similarity(
        folder, pred_img, tgt_img, opts.take_every_other
    )

    # The context manager closes the report even if formatting or writing
    # raises part-way through.
    with open(opts.output_file, 'w') as f:
        for key in results:
            title = "%s for %s: \n" % (key, opts.folder)
            numbers = "\t {:0.4f} | {:0.4f} \n".format(
                results[key][0], results[key][1]
            )

            print(title)
            print(numbers)

            f.write(title)
            f.write(numbers)
ldm/modules/evaluate/frechet_video_distance.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Lint as: python2, python3
17
+ """Minimal Reference implementation for the Frechet Video Distance (FVD).
18
+
19
+ FVD is a metric for the quality of video generation models. It is inspired by
20
+ the FID (Frechet Inception Distance) used for images, but uses a different
21
+ embedding to be better suitable for videos.
22
+ """
23
+
24
+ from __future__ import absolute_import
25
+ from __future__ import division
26
+ from __future__ import print_function
27
+
28
+
29
+ import six
30
+ import tensorflow.compat.v1 as tf
31
+ import tensorflow_gan as tfgan
32
+ import tensorflow_hub as hub
33
+
34
+
35
def preprocess(videos, target_resolution):
    """Resizes every frame and rescales pixel values for the I3D model.

    Args:
      videos: <T>[batch_size, num_frames, height, width, depth] The videos to
        be preprocessed. Any dtype accepted by tf.image.resize_bilinear works.
        Values are expected to be in the range 0-255.
      target_resolution: two-element size, forwarded unchanged as the `size`
        argument of tf.image.resize_bilinear.

    Returns:
      videos: <float32>[batch_size, num_frames, height, width, depth] scaled
        with 2 * v / 255 - 1.
    """
    in_shape = list(videos.shape)
    # Merge batch and time so each frame is resized as an independent image.
    frames = tf.reshape(videos, [-1] + in_shape[-3:])
    resized = tf.image.resize_bilinear(frames, size=target_resolution)
    # Restore the batch axis; -1 recovers num_frames, depth is fixed to 3.
    out_shape = [in_shape[0], -1] + list(target_resolution) + [3]
    restored = tf.reshape(resized, out_shape)
    return 2. * tf.cast(restored, tf.float32) / 255. - 1
55
+
56
+
57
def _is_in_graph(tensor_name):
    """Returns True iff the default graph contains a tensor named `tensor_name`."""
    try:
        tf.get_default_graph().get_tensor_by_name(tensor_name)
    except KeyError:
        # The lookup raises KeyError when the name is unknown.
        found = False
    else:
        found = True
    return found
64
+
65
+
66
def create_id3_embedding(videos,warmup=False,batch_size=16):
    """Embeds the given videos using the Inflated 3D Convolution network.

    Downloads the graph of the I3D from tf.hub and adds it to the graph on the
    first call.

    Args:
      videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3].
        Expected range is [-1, 1].
      warmup: if True, additionally compare the static batch dimension of
        `videos` against `batch_size` with a Python assert while the graph is
        being built.
      batch_size: batch size the graph-execution-time `tf.assert_equal` (and the
        warmup check) compares the first dimension of `videos` against.

    Returns:
      embedding: <float32>[batch_size, embedding_size]. embedding_size depends
                 on the model used.

    Raises:
      AssertionError: if `warmup` is set and the static batch size of `videos`
        is not one of `batch_size`, -1 or None.
    """

    # The batch size used to be hard-coded to 16 here; it is now a parameter.
    module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"


    # Making sure that we import the graph separately for
    # each different input video tensor.
    module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(
        videos.name).replace(":", "_")



    # Checks that run when the graph is executed: value range and batch size.
    assert_ops = [
        tf.Assert(
            tf.reduce_max(videos) <= 1.001,
            ["max value in frame is > 1", videos]),
        tf.Assert(
            tf.reduce_min(videos) >= -1.001,
            ["min value in frame is < -1", videos]),
        tf.assert_equal(
            tf.shape(videos)[0],
            batch_size, ["invalid frame batch size: ",
                         tf.shape(videos)],
            summarize=6),
    ]
    # Route `videos` through an identity op so the asserts above become
    # dependencies of everything that consumes it.
    with tf.control_dependencies(assert_ops):
        videos = tf.identity(videos)

    module_scope = "%s_apply_default/" % module_name

    # To check whether the module has already been loaded into the graph, we look
    # for a given tensor name. If this tensor name exists, we assume the function
    # has been called before and the graph was imported. Otherwise we import it.
    # Note: in theory, the tensor could exist, but have wrong shapes.
    # This will happen if create_id3_embedding is called with a frames_placeholder
    # of wrong size/batch size, because even though that will throw a tf.Assert
    # on graph-execution time, it will insert the tensor (with wrong shape) into
    # the graph. This is why we need the following assert.
    # NOTE(review): indentation was lost in this view; the body of `if warmup:`
    # is assumed to be exactly the two lines below -- confirm against the file.
    # NOTE(review): int() on an unknown (None) batch dimension would raise before
    # the membership test is reached -- confirm whether that case can occur.
    if warmup:
        video_batch_size = int(videos.shape[0])
        assert video_batch_size in [batch_size, -1, None], f"Invalid batch size {video_batch_size}"
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    if not _is_in_graph(tensor_name):
        i3d_model = hub.Module(module_spec, name=module_name)
        # Calling the module is what inserts its ops into the default graph.
        i3d_model(videos)

    # gets the kinetics-i3d-400-logits layer
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
    return tensor
133
+
134
+
135
def calculate_fvd(real_activations,
                  generated_activations):
    """Computes the FVD between two sets of activations.

    Thin wrapper around
    tfgan.eval.frechet_classifier_distance_from_activations.

    Args:
      real_activations: <float32>[num_samples, embedding_size]
      generated_activations: <float32>[num_samples, embedding_size]

    Returns:
      A scalar that contains the requested FVD.
    """
    distance = tfgan.eval.frechet_classifier_distance_from_activations(
        real_activations, generated_activations)
    return distance
ldm/modules/evaluate/ssim.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MIT Licence
2
+
3
+ # Methods to predict the SSIM, taken from
4
+ # https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py
5
+
6
+ from math import exp
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch.autograd import Variable
11
+
12
+ def gaussian(window_size, sigma):
13
+ gauss = torch.Tensor(
14
+ [
15
+ exp(-((x - window_size // 2) ** 2) / float(2 * sigma ** 2))
16
+ for x in range(window_size)
17
+ ]
18
+ )
19
+ return gauss / gauss.sum()
20
+
21
+
22
+ def create_window(window_size, channel):
23
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
24
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
25
+ window = Variable(
26
+ _2D_window.expand(channel, 1, window_size, window_size).contiguous()
27
+ )
28
+ return window
29
+
30
+
31
+ def _ssim(
32
+ img1, img2, window, window_size, channel, mask=None, size_average=True
33
+ ):
34
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
35
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
36
+
37
+ mu1_sq = mu1.pow(2)
38
+ mu2_sq = mu2.pow(2)
39
+ mu1_mu2 = mu1 * mu2
40
+
41
+ sigma1_sq = (
42
+ F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel)
43
+ - mu1_sq
44
+ )
45
+ sigma2_sq = (
46
+ F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel)
47
+ - mu2_sq
48
+ )
49
+ sigma12 = (
50
+ F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel)
51
+ - mu1_mu2
52
+ )
53
+
54
+ C1 = (0.01) ** 2
55
+ C2 = (0.03) ** 2
56
+
57
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
58
+ (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
59
+ )
60
+
61
+ if not (mask is None):
62
+ b = mask.size(0)
63
+ ssim_map = ssim_map.mean(dim=1, keepdim=True) * mask
64
+ ssim_map = ssim_map.view(b, -1).sum(dim=1) / mask.view(b, -1).sum(
65
+ dim=1
66
+ ).clamp(min=1)
67
+ return ssim_map
68
+
69
+ import pdb
70
+
71
+ pdb.set_trace
72
+
73
+ if size_average:
74
+ return ssim_map.mean()
75
+ else:
76
+ return ssim_map.mean(1).mean(1).mean(1)
77
+
78
+
79
+ class SSIM(torch.nn.Module):
80
+ def __init__(self, window_size=11, size_average=True):
81
+ super(SSIM, self).__init__()
82
+ self.window_size = window_size
83
+ self.size_average = size_average
84
+ self.channel = 1
85
+ self.window = create_window(window_size, self.channel)
86
+
87
+ def forward(self, img1, img2, mask=None):
88
+ (_, channel, _, _) = img1.size()
89
+
90
+ if (
91
+ channel == self.channel
92
+ and self.window.data.type() == img1.data.type()
93
+ ):
94
+ window = self.window
95
+ else:
96
+ window = create_window(self.window_size, channel)
97
+
98
+ if img1.is_cuda:
99
+ window = window.cuda(img1.get_device())
100
+ window = window.type_as(img1)
101
+
102
+ self.window = window
103
+ self.channel = channel
104
+
105
+ return _ssim(
106
+ img1,
107
+ img2,
108
+ window,
109
+ self.window_size,
110
+ channel,
111
+ mask,
112
+ self.size_average,
113
+ )
114
+
115
+
116
+ def ssim(img1, img2, window_size=11, mask=None, size_average=True):
117
+ (_, channel, _, _) = img1.size()
118
+ window = create_window(window_size, channel)
119
+
120
+ if img1.is_cuda:
121
+ window = window.cuda(img1.get_device())
122
+ window = window.type_as(img1)
123
+
124
+ return _ssim(img1, img2, window, window_size, channel, mask, size_average)
ldm/modules/evaluate/torch_frechet_video_distance.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # based on https://github.com/universome/fvd-comparison/blob/master/compare_models.py; huge thanks!
2
+ import os
3
+ import numpy as np
4
+ import io
5
+ import re
6
+ import requests
7
+ import html
8
+ import hashlib
9
+ import urllib
10
+ import urllib.request
11
+ import scipy.linalg
12
+ import multiprocessing as mp
13
+ import glob
14
+
15
+
16
+ from tqdm import tqdm
17
+ from typing import Any, List, Tuple, Union, Dict, Callable
18
+
19
+ from torchvision.io import read_video
20
+ import torch; torch.set_grad_enabled(False)
21
+ from einops import rearrange
22
+
23
+ from nitro.util import isvideo
24
+
25
def compute_frechet_distance(mu_sample,sigma_sample,mu_ref,sigma_ref) -> float:
    """Return the Frechet distance between two Gaussians.

    Computes ``|mu_sample - mu_ref|^2 + Tr(sigma_sample + sigma_ref
    - 2 * sqrt(sigma_sample @ sigma_ref))`` and returns its real part.

    Args:
        mu_sample, mu_ref: mean vectors of the sample / reference features.
        sigma_sample, sigma_ref: matching covariance matrices.
    """
    print('Calculate frechet distance...')
    mean_term = np.square(mu_sample - mu_ref).sum()
    # `disp=False` only served to return an error estimate that was thrown
    # away; the argument is deprecated in recent SciPy, so call sqrtm without
    # it and take the matrix directly.
    cov_sqrt = scipy.linalg.sqrtm(np.dot(sigma_sample, sigma_ref))  # pylint: disable=no-member
    # sqrtm may return a complex matrix with a tiny imaginary part; keep the
    # real part of the final scalar only.
    fid = np.real(mean_term + np.trace(sigma_sample + sigma_ref - cov_sqrt * 2))

    return float(fid)
32
+
33
+
34
def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Return the mean over axis 0 and the covariance (columns as variables) of `feats`."""
    covariance = np.cov(feats, rowvar=False)  # [d, d]
    mean = feats.mean(axis=0)  # [d]
    return mean, covariance
39
+
40
+
41
def open_url(url: str, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False) -> Any:
    """Download the given URL and return a binary-mode file object to access the data.

    Strings without a ``scheme://`` prefix and ``file://`` URLs are treated as
    local paths: they are opened in ``"rb"`` mode or, when `return_filename`
    is set, returned as the path string. Any other URL is fetched with
    `requests` and returned as an in-memory ``io.BytesIO``.

    Args:
        url: local path, ``file://`` URL or remote URL.
        num_attempts: maximum number of download attempts (must be >= 1).
        verbose: print progress while downloading.
        return_filename: return the local filename instead of an open file.
            Not supported for downloaded URLs.

    Raises:
        AssertionError: if ``num_attempts < 1`` or if `return_filename` is
            requested for a URL that had to be downloaded.
        Exception: the error of the last attempt is re-raised when every
            download attempt failed.
    """
    assert num_attempts >= 1

    # Doesn't look like an URL scheme so interpret it as a local filename.
    if not re.match('^[a-z]+://', url):
        return url if return_filename else open(url, "rb")

    # Handle file URLs.  This code handles unusual file:// patterns that
    # arise on Windows:
    #
    # file:///c:/foo.txt
    #
    # which would translate to a local '/c:/foo.txt' filename that's
    # invalid.  Drop the forward slash for such pathnames.
    #
    # If you touch this code path, you should test it on both Linux and
    # Windows.
    #
    # Some internet resources suggest using urllib.request.url2pathname() but
    # that converts forward slashes to backslashes and this causes
    # its own set of problems.
    if url.startswith('file://'):
        filename = urllib.parse.urlparse(url).path
        if re.match(r'^/[a-zA-Z]:', filename):
            filename = filename[1:]
        return filename if return_filename else open(filename, "rb")

    # Download.
    url_data = None
    with requests.Session() as session:
        if verbose:
            print("Downloading %s ..." % url, end="", flush=True)
        for attempts_left in reversed(range(num_attempts)):
            try:
                with session.get(url) as res:
                    res.raise_for_status()
                    if len(res.content) == 0:
                        raise IOError("No data received")

                    # Small responses may be a Google Drive interstitial page
                    # instead of the requested payload.
                    if len(res.content) < 8192:
                        content_str = res.content.decode("utf-8")
                        if "download_warning" in res.headers.get("Set-Cookie", ""):
                            links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
                            if len(links) == 1:
                                # Follow the confirmation link on the next attempt.
                                url = requests.compat.urljoin(url, links[0])
                                raise IOError("Google Drive virus checker nag")
                        if "Google Drive - Quota exceeded" in content_str:
                            raise IOError("Google Drive download quota exceeded -- please try again later")

                    url_data = res.content
                    if verbose:
                        print(" done")
                    break
            except Exception:
                # Retry on ordinary errors only; KeyboardInterrupt/SystemExit
                # are not subclasses of Exception and propagate immediately.
                if not attempts_left:
                    if verbose:
                        print(" failed")
                    raise
                if verbose:
                    print(".", end="", flush=True)

    # Return data as file object.
    assert not return_filename
    return io.BytesIO(url_data)
113
+
114
def load_video(ip):
    """Read the video at `ip` and return its frames as a uint8 tensor laid out (t, c, h, w)."""
    # read_video returns several values; only the first (the frames) is used.
    frames = read_video(ip)[0]
    return rearrange(frames, 't h w c -> t c h w').to(torch.uint8)
118
+
119
def get_data_from_str(input_str,nprc = None):
    """Load every ``*.mp4`` directly inside a directory into one float tensor.

    Args:
        input_str: directory that is scanned (non-recursively) for ``*.mp4``.
        nprc: number of worker processes used for decoding. Defaults to
            ``mp.cpu_count()``, or 1 when that is unavailable.

    Returns:
        The loaded videos stacked along a new leading dimension and cast to
        float. Videos arrive in completion order (``imap_unordered``), not in
        file-list order.

    Raises:
        AssertionError: if `input_str` is not a directory.
    """
    assert os.path.isdir(input_str), f'Specified input folder "{input_str}" is not a directory'
    vid_filelist = glob.glob(os.path.join(input_str,'*.mp4'))
    print(f'Found {len(vid_filelist)} videos in dir {input_str}')

    if nprc is None:
        try:
            nprc = mp.cpu_count()
        except NotImplementedError:
            print('WARNING: cpu_count() not avlailable, using only 1 cpu for video loading')
            nprc = 1

    # The context manager shuts the worker processes down once every result
    # has been collected; previously the pool was never closed.
    with mp.Pool(processes=nprc) as pool:
        vids = list(
            tqdm(
                pool.imap_unordered(load_video, vid_filelist),
                total=len(vid_filelist),
                desc='Loading videos...',
            )
        )

    return torch.stack(vids,dim=0).float()
141
+
142
def get_stats(stats):
    """Load precomputed statistics from an ``.npz`` file into a plain dict.

    Args:
        stats: path to an existing ``.npz`` archive.

    Returns:
        Dict mapping every array name stored in the archive to its array.

    Raises:
        AssertionError: if `stats` is not an existing file ending in ``.npz``.
    """
    assert os.path.isfile(stats) and stats.endswith('.npz'), f'no stats found under {stats}'

    print(f'Using precomputed statistics under {stats}')
    # np.load keeps the archive open for lazy access; read every array eagerly
    # inside the context manager so the file handle is released on return.
    with np.load(stats) as archive:
        return {key: archive[key] for key in archive.files}
150
+
151
+
152
+
153
+
154
@torch.no_grad()
def compute_fvd(ref_input, sample_input, bs=32,
                ref_stats=None,
                sample_stats=None,
                nprc_load=None):
    """Compute the FVD between reference and sample videos.

    Either side can be supplied as a string (handed to `get_data_from_str`),
    as in-memory data, or as a path to precomputed statistics
    (`ref_stats` / `sample_stats`). Statistics are only computed for the
    sides without a precomputed file.

    Returns:
        ``{'FVD': <float>}``.
    """
    have_ref_stats = ref_stats is not None
    have_sample_stats = sample_stats is not None

    if have_ref_stats and have_sample_stats:
        # Nothing to embed: merge both precomputed files.
        stats = get_stats(sample_stats)
        stats.update(get_stats(ref_stats))
    else:
        # A side whose statistics already exist does not need to be embedded.
        only_ref = have_sample_stats
        only_sample = have_ref_stats

        if isinstance(ref_input, str) and not only_sample:
            ref_input = get_data_from_str(ref_input, nprc_load)

        if isinstance(sample_input, str) and not only_ref:
            sample_input = get_data_from_str(sample_input, nprc_load)

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        stats = compute_statistics(sample_input, ref_input,
                                   device=device,
                                   bs=bs,
                                   only_ref=only_ref,
                                   only_sample=only_sample)

        if only_ref:
            stats.update(get_stats(sample_stats))
        elif only_sample:
            stats.update(get_stats(ref_stats))

    return {'FVD': compute_frechet_distance(**stats)}
196
+
197
+
198
def _num_batches(n_videos, bs):
    """Return how many chunks cover `n_videos` items with at most `bs` each (ceil division)."""
    return -(-n_videos // bs)


@torch.no_grad()
def compute_statistics(videos_fake, videos_real, device: str='cuda', bs=32, only_ref=False,only_sample=False) -> Dict:
    """Embed videos with the I3D detector and return feature statistics.

    Args:
        videos_fake: generated videos (ignored when `only_ref` is set).
        videos_real: reference videos (ignored when `only_sample` is set).
            Inputs for which `isvideo` is false are assumed to be numpy arrays
            of shape (n_vids, t, h, w, c) in range [0, 255].
        device: device the detector runs on.
        bs: batch size used to chunk the videos.
        only_ref: embed the reference videos only.
        only_sample: embed the generated videos only.

    Returns:
        Dict with ``mu_sample``/``sigma_sample`` and/or ``mu_ref``/``sigma_ref``
        for the sides that were embedded.

    Raises:
        AssertionError: if both `only_ref` and `only_sample` are set.
    """
    # Cheap argument check first, before the detector is downloaded.
    assert not (only_sample and only_ref), 'only_ref and only_sample arguments are mutually exclusive'

    detector_url = 'https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt?dl=1'
    detector_kwargs = dict(rescale=True, resize=True, return_features=True) # Return raw features before the softmax layer.

    with open_url(detector_url, verbose=False) as f:
        detector = torch.jit.load(f).eval().to(device)

    ref_embed, sample_embed = [], []

    info = f'Computing I3D activations for FVD score with batch size {bs}'

    def embed(batch):
        # Run one batch through the detector and bring the features to numpy.
        return detector(batch.to(device).contiguous(), **detector_kwargs).cpu().numpy()

    if only_ref:

        if not isvideo(videos_real):
            # if not is video we assume to have numpy arrays of shape (n_vids, t, h, w, c) in range [0,255]
            videos_real = torch.from_numpy(videos_real).permute(0, 4, 1, 2, 3).float()
            print(videos_real.shape)

        n_secs = _num_batches(videos_real.shape[0], bs)
        videos_real = torch.tensor_split(videos_real, n_secs, dim=0)

        for ref_v in tqdm(videos_real, total=len(videos_real),desc=info):
            ref_embed.append(embed(ref_v))

    elif only_sample:

        if not isvideo(videos_fake):
            # if not is video we assume to have numpy arrays of shape (n_vids, t, h, w, c) in range [0,255]
            videos_fake = torch.from_numpy(videos_fake).permute(0, 4, 1, 2, 3).float()
            print(videos_fake.shape)

        n_secs = _num_batches(videos_fake.shape[0], bs)
        # Fix: this branch used to split `videos_real` (which is not used in
        # this mode) and then iterate the unsplit `videos_fake`, i.e. over
        # single videos instead of batches.
        videos_fake = torch.tensor_split(videos_fake, n_secs, dim=0)

        for sample_v in tqdm(videos_fake, total=len(videos_fake),desc=info):
            sample_embed.append(embed(sample_v))

    else:

        if not isvideo(videos_real):
            # if not is video we assume to have numpy arrays of shape (n_vids, t, h, w, c) in range [0,255]
            videos_real = torch.from_numpy(videos_real).permute(0, 4, 1, 2, 3).float()

        if not isvideo(videos_fake):
            videos_fake = torch.from_numpy(videos_fake).permute(0, 4, 1, 2, 3).float()

        # Both sets are cut into the same number of chunks (derived from the
        # generated set) so they can be walked in lockstep.
        n_secs = _num_batches(videos_fake.shape[0], bs)

        videos_real = torch.tensor_split(videos_real, n_secs, dim=0)
        videos_fake = torch.tensor_split(videos_fake, n_secs, dim=0)

        for ref_v, sample_v in tqdm(zip(videos_real,videos_fake),total=len(videos_fake),desc=info):
            feats_sample = embed(sample_v)
            feats_ref = embed(ref_v)
            sample_embed.append(feats_sample)
            ref_embed.append(feats_ref)

    out = dict()
    if len(sample_embed) > 0:
        sample_embed = np.concatenate(sample_embed,axis=0)
        mu_sample, sigma_sample = compute_stats(sample_embed)
        out.update({'mu_sample': mu_sample,
                    'sigma_sample': sigma_sample})

    if len(ref_embed) > 0:
        ref_embed = np.concatenate(ref_embed,axis=0)
        mu_ref, sigma_ref = compute_stats(ref_embed)
        out.update({'mu_ref': mu_ref,
                    'sigma_ref': sigma_ref})

    return out
ldm/modules/image_degradation/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
2
+ from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
ldm/modules/image_degradation/bsrgan.py ADDED
@@ -0,0 +1,730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ # --------------------------------------------
4
+ # Super-Resolution
5
+ # --------------------------------------------
6
+ #
7
+ # Kai Zhang ([email protected])
8
+ # https://github.com/cszn
9
+ # From 2019/03--2021/08
10
+ # --------------------------------------------
11
+ """
12
+
13
+ import numpy as np
14
+ import cv2
15
+ import torch
16
+
17
+ from functools import partial
18
+ import random
19
+ from scipy import ndimage
20
+ import scipy
21
+ import scipy.stats as ss
22
+ from scipy.interpolate import interp2d
23
+ from scipy.linalg import orth
24
+ import albumentations
25
+
26
+ import ldm.modules.image_degradation.utils_image as util
27
+
28
+
29
def modcrop_np(img, sf):
    '''
    Crop a copy of the image so its first two dimensions are multiples of sf.
    Args:
        img: numpy image, 2-D or 3-D (only the first two axes are cropped)
        sf: scale factor
    Return:
        cropped image (a view into a fresh copy; the input is not modified)
    '''
    dim0, dim1 = img.shape[:2]
    keep0 = dim0 - dim0 % sf
    keep1 = dim1 - dim1 % sf
    return np.copy(img)[:keep0, :keep1, ...]
40
+
41
+
42
+ """
43
+ # --------------------------------------------
44
+ # anisotropic Gaussian kernels
45
+ # --------------------------------------------
46
+ """
47
+
48
+
49
def analytic_kernel(k):
    """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
    n = k.shape[0]
    side = 3 * n - 2
    expanded = np.zeros((side, side))
    # Accumulate one copy of k, scaled by k[row, col], at stride-2 offsets.
    for row in range(n):
        top = 2 * row
        for col in range(n):
            left = 2 * col
            expanded[top:top + n, left:left + n] += k[row, col] * k
    # Drop the border of width n // 2, then normalise to unit sum.
    margin = n // 2
    centre = expanded[margin:-margin, margin:-margin]
    return centre / centre.sum()
63
+
64
+
65
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
    """ generate an anisotropic Gaussian kernel
    Args:
        ksize : e.g., 15, kernel size
        theta : [0,  pi], rotation angle range
        l1    : [0.1,50], scaling of eigenvalues
        l2    : [0.1,l1], scaling of eigenvalues
        If l1 = l2, will get an isotropic Gaussian kernel.
    Returns:
        k     : kernel
    """
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])
    # First principal direction: the unit x vector rotated by theta.
    axis = np.dot(rotation, np.array([1., 0.]))
    basis = np.array([[axis[0], axis[1]], [axis[1], -axis[0]]])
    scales = np.array([[l1, 0], [0, l2]])
    # Covariance assembled as basis @ diag(l1, l2) @ inv(basis).
    covariance = np.dot(np.dot(basis, scales), np.linalg.inv(basis))
    return gm_blur_kernel(mean=[0, 0], cov=covariance, size=ksize)
84
+
85
+
86
def gm_blur_kernel(mean, cov, size=15):
    """Sample a 2-D Gaussian pdf on a size x size grid and normalise it to sum 1.

    Grid coordinates are offsets from ``size / 2.0 - 0.5``; the pdf is evaluated
    at ``[x_offset, y_offset]`` for the entry stored at ``[y, x]``.
    """
    center = size / 2.0 + 0.5
    kernel = np.zeros([size, size])
    for row, col in np.ndindex(size, size):
        point = [col - center + 1, row - center + 1]
        kernel[row, col] = ss.multivariate_normal.pdf(point, mean=mean, cov=cov)
    return kernel / np.sum(kernel)
97
+
98
+
99
def shift_pixel(x, sf, upper_left=True):
    """shift pixel for super-resolution with different scale factors

    The image is resampled with bilinear interpolation at coordinates moved by
    ``(sf - 1) * 0.5`` pixels along both axes; coordinates are clipped to the
    image bounds.

    Args:
        x: 2-D array, or 3-D array with channels last. A 3-D input is
            overwritten channel by channel (in place) and also returned.
        sf: scale factor
        upper_left: shift direction; True samples at ``coord + shift``,
            False at ``coord - shift``.

    Returns:
        The shifted image, same shape as `x`.
    """
    # scipy.interpolate.interp2d is no longer available in recent SciPy; a
    # degree-1 RectBivariateSpline through every grid point is the equivalent
    # bilinear interpolant on a regular grid.
    from scipy.interpolate import RectBivariateSpline

    h, w = x.shape[:2]
    shift = (sf - 1) * 0.5
    xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
    if upper_left:
        x1 = xv + shift
        y1 = yv + shift
    else:
        x1 = xv - shift
        y1 = yv - shift

    x1 = np.clip(x1, 0, w - 1)
    y1 = np.clip(y1, 0, h - 1)

    def resample(plane):
        # `plane` is indexed [row (y), column (x)], hence the (yv, xv) order.
        spline = RectBivariateSpline(yv, xv, plane, kx=1, ky=1)
        # grid=False with broadcast coordinates evaluates the full (h, w)
        # grid and tolerates the repeated coordinates produced by clipping.
        return spline(y1[:, None], x1[None, :], grid=False)

    if x.ndim == 2:
        x = resample(x)
    elif x.ndim == 3:
        for i in range(x.shape[-1]):
            x[:, :, i] = resample(x[:, :, i])

    return x
126
+
127
+
128
def blur(x, k):
    '''
    Blur every image of a batch with its own kernel (shared across channels).
    x: image, NxcxHxW
    k: kernel, Nx1xhxw
    '''
    batch, channels = x.shape[:2]
    pad_a = (k.shape[-2] - 1) // 2
    pad_b = (k.shape[-1] - 1) // 2
    padded = torch.nn.functional.pad(x, pad=(pad_a, pad_b, pad_a, pad_b), mode='replicate')
    # One kernel per (sample, channel) pair, applied as a grouped convolution
    # over a single "image" that has batch * channels channels.
    kernels = k.repeat(1, channels, 1, 1)
    kernels = kernels.view(-1, 1, kernels.shape[2], kernels.shape[3])
    stacked = padded.view(1, -1, padded.shape[2], padded.shape[3])
    filtered = torch.nn.functional.conv2d(
        stacked, kernels, bias=None, stride=1, padding=0, groups=batch * channels
    )
    return filtered.view(batch, channels, filtered.shape[2], filtered.shape[3])
143
+
144
+
145
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
    """
    # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
    # Kai Zhang
    # min_var = 0.175 * sf  # variance of the gaussian kernel will be sampled between min_var and max_var
    # max_var = 2.5 * sf

    Samples a random anisotropic Gaussian kernel of shape k_size, multiplies it
    element-wise by (1 + uniform noise in [-noise_level, noise_level]) and
    normalises it to unit sum.
    """
    # Random eigenvalues and rotation angle of the covariance matrix, followed
    # by the multiplicative noise (the draw order is kept fixed).
    var_span = max_var - min_var
    lambda_1 = min_var + np.random.rand() * var_span
    lambda_2 = min_var + np.random.rand() * var_span
    theta = np.random.rand() * np.pi
    noise = -noise_level + np.random.rand(*k_size) * noise_level * 2

    # Covariance = rotation @ diag(lambdas) @ rotation.T
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t],
                         [sin_t, cos_t]])
    covariance = rotation @ np.diag([lambda_1, lambda_2]) @ rotation.T
    inv_cov = np.linalg.inv(covariance)[None, None, :, :]

    # Expectation position (shifting kernel for aligned image)
    mean = (k_size // 2 - 0.5 * (scale_factor - 1))[None, None, :, None]

    # Offsets of every grid position from the mean, as column vectors.
    grid_x, grid_y = np.meshgrid(range(k_size[0]), range(k_size[1]))
    offsets = np.stack([grid_x, grid_y], 2)[:, :, :, None] - mean
    offsets_t = offsets.transpose(0, 1, 3, 2)

    # Gaussian value for every pixel of the kernel, then the noise factor.
    quad_form = np.squeeze(offsets_t @ inv_cov @ offsets)
    raw_kernel = np.exp(-0.5 * quad_form) * (1 + noise)

    return raw_kernel / np.sum(raw_kernel)
185
+
186
+
187
def fspecial_gaussian(hsize, sigma):
    """Return a normalised hsize x hsize Gaussian filter with standard deviation sigma.

    Entries smaller than machine epsilon times the maximum are zeroed before
    normalisation; the result sums to one unless every entry is zero.
    """
    hsize = [hsize, hsize]
    siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
    std = sigma
    [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
    arg = -(x * x + y * y) / (2 * std * std)
    h = np.exp(arg)
    # np.finfo replaces scipy.finfo, a deprecated alias of the NumPy function
    # in the top-level scipy namespace.
    h[h < np.finfo(float).eps * h.max()] = 0
    sumh = h.sum()
    if sumh != 0:
        h = h / sumh
    return h
199
+
200
+
201
def fspecial_laplacian(alpha):
    """Return the 3x3 Laplacian filter for shape parameter alpha (clamped to [0, 1])."""
    alpha = max(0, min(alpha, 1))
    denom = alpha + 1
    corner = alpha / denom
    edge = (1 - alpha) / denom
    centre = -4 / denom
    return np.array([
        [corner, edge, corner],
        [edge, centre, edge],
        [corner, edge, corner],
    ])
208
+
209
+
210
def fspecial(filter_type, *args, **kwargs):
    '''
    Dispatch to the filter builder selected by filter_type
    ('gaussian' or 'laplacian'); any other value yields None.

    python code from:
    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
    '''
    if filter_type == 'gaussian':
        result = fspecial_gaussian(*args, **kwargs)
    elif filter_type == 'laplacian':
        result = fspecial_laplacian(*args, **kwargs)
    else:
        result = None
    return result
219
+
220
+
221
+ """
222
+ # --------------------------------------------
223
+ # degradation models
224
+ # --------------------------------------------
225
+ """
226
+
227
+
228
def bicubic_degradation(x, sf=3):
    '''
    Downscale an image by a factor of sf via util.imresize_np.
    Args:
        x: HxWxC image, [0, 1]
        sf: down-scale factor
    Return:
        bicubicly downsampled LR image
    '''
    return util.imresize_np(x, scale=1 / sf)
238
+
239
+
240
def srmd_degradation(x, k, sf=3):
    ''' blur + bicubic downsampling
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
    Reference:
        @inproceedings{zhang2018learning,
          title={Learning a single convolutional super-resolution network for multiple degradations},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={3262--3271},
          year={2018}
        }
    '''
    # ndimage.convolve is the supported spelling; the scipy.ndimage.filters
    # sub-namespace is deprecated. The kernel gets a singleton channel axis so
    # every channel is blurred independently.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
    x = bicubic_degradation(x, sf=sf)
    return x
260
+
261
+
262
def dpsr_degradation(x, k, sf=3):
    ''' bicubic downsampling + blur
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
    Reference:
        @inproceedings{zhang2019deep,
          title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={1671--1681},
          year={2019}
        }
    '''
    x = bicubic_degradation(x, sf=sf)
    # ndimage.convolve is the supported spelling; the scipy.ndimage.filters
    # sub-namespace is deprecated. The kernel gets a singleton channel axis so
    # every channel is blurred independently.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    return x
282
+
283
+
284
def classical_degradation(x, k, sf=3):
    ''' blur + downsampling
    Args:
        x: HxWxC image, [0, 1]/[0, 255]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image (every sf-th pixel of the blurred image,
        starting at index 0 along both spatial axes)
    '''
    # ndimage.convolve is the supported spelling; the scipy.ndimage.filters
    # sub-namespace is deprecated. The kernel gets a singleton channel axis so
    # every channel is blurred independently.
    x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    st = 0
    return x[st::sf, st::sf, ...]
297
+
298
+
299
def add_sharpening(img, weight=0.5, radius=50, threshold=10):
    """USM sharpening. borrowed from real-ESRGAN
    Input image: I; Blurry image: B.
    1. K = I + weight * (I - B)
    2. Mask = 1 if abs(I - B) * 255 > threshold, else: 0
    3. Blur mask:
    4. Out = Mask * K + (1 - Mask) * I
    Args:
        img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
        weight (float): Sharp weight. Default: 0.5.
        radius (float): Kernel size of Gaussian blur. Default: 50.
            Even values are bumped to the next odd number.
        threshold (int): residual level (on a 0-255 scale) above which a
            pixel is sharpened. Default: 10.
    """
    ksize = radius + 1 if radius % 2 == 0 else radius
    blurred = cv2.GaussianBlur(img, (ksize, ksize), 0)
    residual = img - blurred

    # Binary mask of pixels with a strong residual, softened with the same blur.
    hard_mask = (np.abs(residual) * 255 > threshold).astype('float32')
    soft_mask = cv2.GaussianBlur(hard_mask, (ksize, ksize), 0)

    sharpened = np.clip(img + weight * residual, 0, 1)
    return soft_mask * sharpened + (1 - soft_mask) * img
323
+
324
+
325
def add_blur(img, sf=4):
    """Blur ``img`` with a randomly drawn Gaussian kernel.

    With probability 0.5 an anisotropic Gaussian is used, otherwise an
    isotropic one; kernel size and widths are drawn at random and the
    width ranges grow with ``sf``.

    Args:
        img: HxWxC image
        sf: scale factor controlling the maximum kernel width

    Returns:
        blurred image
    """
    wd2 = 4.0 + sf
    wd = 2.0 + 0.2 * sf
    if random.random() < 0.5:
        l1 = wd2 * random.random()
        l2 = wd2 * random.random()
        k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
    else:
        k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random())
    # ``scipy.ndimage.filters`` is a deprecated alias namespace; call the
    # public ``ndimage.convolve`` directly.
    img = ndimage.convolve(img, np.expand_dims(k, axis=2), mode='mirror')

    return img
337
+
338
+
339
def add_resize(img, sf=4):
    """Randomly rescale ``img`` and clip the result to [0, 1].

    A uniform draw picks the mode: above 0.8 upscale by a factor in [1, 2],
    below 0.7 downscale by a factor in [0.5 / sf, 1], otherwise keep the size.
    The OpenCV interpolation flag is chosen at random from 1, 2 and 3.

    Args:
        img: HxWxC image
        sf: scale factor bounding the strongest downscale

    Returns:
        resized image clipped to [0, 1]
    """
    draw = np.random.rand()
    if draw > 0.8:  # up
        scale = random.uniform(1, 2)
    elif draw < 0.7:  # down
        scale = random.uniform(0.5 / sf, 1)
    else:
        scale = 1.0
    new_w = int(scale * img.shape[1])
    new_h = int(scale * img.shape[0])
    resized = cv2.resize(img, (new_w, new_h), interpolation=random.choice([1, 2, 3]))
    return np.clip(resized, 0.0, 1.0)
351
+
352
+
353
+ # def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
354
+ # noise_level = random.randint(noise_level1, noise_level2)
355
+ # rnum = np.random.rand()
356
+ # if rnum > 0.6: # add color Gaussian noise
357
+ # img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
358
+ # elif rnum < 0.4: # add grayscale Gaussian noise
359
+ # img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
360
+ # else: # add noise
361
+ # L = noise_level2 / 255.
362
+ # D = np.diag(np.random.rand(3))
363
+ # U = orth(np.random.rand(3, 3))
364
+ # conv = np.dot(np.dot(np.transpose(U), D), U)
365
+ # img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
366
+ # img = np.clip(img, 0.0, 1.0)
367
+ # return img
368
+
369
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
    """Add Gaussian noise of a randomly chosen flavour and clip to [0, 1].

    One of three noise types is picked by a uniform draw: per-channel
    (colour) noise, one noise plane shared by all channels (grayscale), or
    channel-correlated noise drawn from a random 3x3 covariance.

    Args:
        img: HxWxC image, [0, 1]; the correlated branch draws 3-component
            noise vectors
        noise_level1: lower bound of the noise level (0-255 scale)
        noise_level2: upper bound of the noise level (0-255 scale)

    Returns:
        a new noisy image clipped to [0, 1]; the input is not modified
    """
    noise_level = random.randint(noise_level1, noise_level2)
    draw = np.random.rand()
    if draw > 0.6:  # colour Gaussian noise
        noise = np.random.normal(0, noise_level / 255.0, img.shape)
    elif draw < 0.4:  # grayscale Gaussian noise, broadcast over channels
        noise = np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1))
    else:  # channel-correlated noise
        L = noise_level2 / 255.
        eigvals = np.diag(np.random.rand(3))
        basis = orth(np.random.rand(3, 3))
        cov = np.dot(np.dot(np.transpose(basis), eigvals), basis)
        noise = np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * cov), img.shape[:2])
    return np.clip(img + noise.astype(np.float32), 0.0, 1.0)
384
+
385
+
386
def add_speckle_noise(img, noise_level1=2, noise_level2=25):
    """Add multiplicative (speckle) Gaussian noise and clip to [0, 1].

    The noise is scaled by the image itself. A uniform draw picks between
    per-channel noise, one noise plane shared by all channels, and
    channel-correlated noise drawn from a random 3x3 covariance.

    Args:
        img: HxWxC image; clipped to [0, 1] before noise is applied
        noise_level1: lower bound of the noise level (0-255 scale)
        noise_level2: upper bound of the noise level (0-255 scale)

    Returns:
        a new noisy image clipped to [0, 1]
    """
    noise_level = random.randint(noise_level1, noise_level2)
    # np.clip returns a fresh array, so the in-place update below never
    # touches the caller's data.
    base = np.clip(img, 0.0, 1.0)
    draw = random.random()
    if draw > 0.6:
        noise = np.random.normal(0, noise_level / 255.0, base.shape)
    elif draw < 0.4:
        noise = np.random.normal(0, noise_level / 255.0, (*base.shape[:2], 1))
    else:
        L = noise_level2 / 255.
        eigvals = np.diag(np.random.rand(3))
        basis = orth(np.random.rand(3, 3))
        cov = np.dot(np.dot(np.transpose(basis), eigvals), basis)
        noise = np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * cov), base.shape[:2])
    base += base * noise.astype(np.float32)
    return np.clip(base, 0.0, 1.0)
402
+
403
+
404
def add_Poisson_noise(img):
    """Add Poisson (shot) noise and clip to [0, 1].

    The image is first quantised to 256 levels. With probability 0.5 the
    Poisson noise is sampled per channel; otherwise it is sampled on a
    luma plane and the same noise is added to every channel.

    Args:
        img: HxWxC image, [0, 1]; the luma branch reads the first 3 channels

    Returns:
        a new noisy image clipped to [0, 1]
    """
    quantized = np.clip((img * 255.0).round(), 0, 255) / 255.
    peak = 10 ** (2 * random.random() + 2.0)  # exponent drawn from [2, 4]
    if random.random() < 0.5:
        noisy = np.random.poisson(quantized * peak).astype(np.float32) / peak
    else:
        luma = np.dot(quantized[..., :3], [0.299, 0.587, 0.114])
        luma = np.clip((luma * 255.0).round(), 0, 255) / 255.
        luma_noise = np.random.poisson(luma * peak).astype(np.float32) / peak - luma
        noisy = quantized
        noisy += luma_noise[:, :, np.newaxis]
    return np.clip(noisy, 0.0, 1.0)
416
+
417
+
418
def add_JPEG_noise(img):
    """Round-trip ``img`` through in-memory JPEG compression.

    The quality factor is drawn uniformly from [30, 95]. The image is
    converted to uint8 BGR for OpenCV, encoded, decoded, and converted
    back via ``util.uint2single`` to RGB.

    Args:
        img: RGB image accepted by ``util.single2uint``

    Returns:
        the JPEG-degraded RGB image
    """
    quality = random.randint(30, 95)
    bgr = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
    _, encoded = cv2.imencode('.jpg', bgr, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
    decoded = cv2.imdecode(encoded, 1)
    return cv2.cvtColor(util.uint2single(decoded), cv2.COLOR_BGR2RGB)
425
+
426
+
427
def random_crop(lq, hq, sf=4, lq_patchsize=64):
    """Cut spatially aligned random patches from an LQ/HQ image pair.

    Args:
        lq: low-quality image, HxWxC
        hq: high-quality image, expected to be ``sf`` times larger than ``lq``
        sf: scale factor between ``lq`` and ``hq``
        lq_patchsize: side length of the LQ patch

    Returns:
        (lq_patch, hq_patch); the HQ patch has side ``lq_patchsize * sf`` and
        starts at the LQ offset multiplied by ``sf``
    """
    lq_h, lq_w = lq.shape[:2]
    top = random.randint(0, lq_h - lq_patchsize)
    left = random.randint(0, lq_w - lq_patchsize)
    lq_patch = lq[top:top + lq_patchsize, left:left + lq_patchsize, :]

    hq_top, hq_left = int(top * sf), int(left * sf)
    hq_size = lq_patchsize * sf
    hq_patch = hq[hq_top:hq_top + hq_size, hq_left:hq_left + hq_size, :]
    return lq_patch, hq_patch
436
+
437
+
438
def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    """
    This is the degradation model of BSRGAN from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    ----------
    img: HXWXC, [0, 1], its size should be larger than (lq_patchsizexsf)x(lq_patchsizexsf)
    sf: scale factor
    lq_patchsize: side length of the returned low-quality patch
    isp_model: camera ISP model (optional; the ISP step is skipped when None)
    Returns
    -------
    img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
    hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
    Raises
    ------
    ValueError: if the mod-cropped image is smaller than lq_patchsize * sf on either side
    """
    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
    sf_ori = sf

    h1, w1 = img.shape[:2]
    # mod crop: rows are bounded by the height and columns by the width
    # (the two bounds were previously swapped, which is wrong for
    # non-square images)
    img = img.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
        raise ValueError(f'img size ({h1}X{w1}) is too small!')

    hq = img.copy()

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
                             interpolation=random.choice([1, 2, 3]))
        else:
            img = util.imresize_np(img, 1 / 2, True)
        img = np.clip(img, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 after downsample2 (step 3 reads a, b set in step 2)
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    for i in shuffle_order:

        if i == 0:
            img = add_blur(img, sf=sf)

        elif i == 1:
            img = add_blur(img, sf=sf)

        elif i == 2:
            # remember the size before downsample2 so downsample3 can
            # resize to exactly 1/sf of it
            a, b = img.shape[1], img.shape[0]
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
                                 interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                # ``scipy.ndimage.filters`` is deprecated; use ``ndimage.convolve``
                img = ndimage.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
                img = img[0::sf, 0::sf, ...]  # nearest downsampling
            img = np.clip(img, 0.0, 1.0)

        elif i == 3:
            # downsample3
            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            img = np.clip(img, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                img = add_JPEG_noise(img)

        elif i == 6:
            # add processed camera sensor noise
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)

    # add final JPEG compression noise
    img = add_JPEG_noise(img)

    # random crop
    img, hq = random_crop(img, hq, sf_ori, lq_patchsize)

    return img, hq
527
+
528
+
529
+ # todo no isp_model?
530
def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    """
    This is the degradation model of BSRGAN from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    ----------
    image: HXWXC image in the format accepted by util.uint2single
    sf: scale factor
    isp_model: camera ISP model (accepted for signature compatibility; unused here)
    Returns
    -------
    example: dict with the single key "image" holding the degraded image
             after conversion by util.single2uint
    """
    image = util.uint2single(image)
    jpeg_prob, scale2_prob = 0.9, 0.25

    h1, w1 = image.shape[:2]
    # mod crop: rows are bounded by the height and columns by the width
    # (the two bounds were previously swapped, which is wrong for
    # non-square images)
    image = image.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
                               interpolation=random.choice([1, 2, 3]))
        else:
            image = util.imresize_np(image, 1 / 2, True)
        image = np.clip(image, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 after downsample2 (step 3 reads a, b set in step 2)
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    for i in shuffle_order:

        if i == 0:
            image = add_blur(image, sf=sf)

        elif i == 1:
            image = add_blur(image, sf=sf)

        elif i == 2:
            # remember the size before downsample2 so downsample3 can
            # resize to exactly 1/sf of it
            a, b = image.shape[1], image.shape[0]
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
                                   interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                # ``scipy.ndimage.filters`` is deprecated; use ``ndimage.convolve``
                image = ndimage.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
                image = image[0::sf, 0::sf, ...]  # nearest downsampling
            image = np.clip(image, 0.0, 1.0)

        elif i == 3:
            # downsample3
            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            image = np.clip(image, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                image = add_JPEG_noise(image)

        # i == 6 (processed camera sensor noise) is intentionally a no-op
        # in this variant; it stays in the shuffle so the random sequence
        # of the remaining steps is unchanged.

    # add final JPEG compression noise
    image = add_JPEG_noise(image)
    image = util.single2uint(image)
    example = {"image": image}
    return example
614
+
615
+
616
+ # TODO: in case there is a pickle error, replace in-place updates (a += x) with a = a + x in add_speckle_noise etc.
617
def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
    """
    This is an extended degradation model by combining
    the degradation models of BSRGAN and Real-ESRGAN
    ----------
    img: HXWXC, [0, 1], its size should be larger than (lq_patchsizexsf)x(lq_patchsizexsf)
    sf: scale factor
    shuffle_prob: probability of fully shuffling the 13 degradation steps;
                  otherwise only the two noise groups are shuffled locally
    use_sharp: sharpen the img before degrading it
    lq_patchsize: side length of the returned low-quality patch
    isp_model: camera ISP model (optional; the ISP steps are skipped when None)
    Returns
    -------
    img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
    hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
    Raises
    ------
    ValueError: if the mod-cropped image is smaller than lq_patchsize * sf on either side
    """

    h1, w1 = img.shape[:2]
    # mod crop: rows are bounded by the height and columns by the width
    # (the two bounds were previously swapped, which is wrong for
    # non-square images)
    img = img.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
        raise ValueError(f'img size ({h1}X{w1}) is too small!')

    if use_sharp:
        img = add_sharpening(img)
    hq = img.copy()

    if random.random() < shuffle_prob:
        shuffle_order = random.sample(range(13), 13)
    else:
        shuffle_order = list(range(13))
        # local shuffle for noise, JPEG is always the last one
        shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6)))
        shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13)))

    poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1

    for i in shuffle_order:
        if i == 0:
            img = add_blur(img, sf=sf)
        elif i == 1:
            img = add_resize(img, sf=sf)
        elif i == 2:
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
        elif i == 3:
            if random.random() < poisson_prob:
                img = add_Poisson_noise(img)
        elif i == 4:
            if random.random() < speckle_prob:
                img = add_speckle_noise(img)
        elif i == 5:
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)
        elif i == 6:
            img = add_JPEG_noise(img)
        elif i == 7:
            img = add_blur(img, sf=sf)
        elif i == 8:
            img = add_resize(img, sf=sf)
        elif i == 9:
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
        elif i == 10:
            if random.random() < poisson_prob:
                img = add_Poisson_noise(img)
        elif i == 11:
            if random.random() < speckle_prob:
                img = add_speckle_noise(img)
        elif i == 12:
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)
        else:
            print('check the shuffle!')

    # resize to desired size
    img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
                     interpolation=random.choice([1, 2, 3]))

    # add final JPEG compression noise
    img = add_JPEG_noise(img)

    # random crop
    img, hq = random_crop(img, hq, sf, lq_patchsize)

    return img, hq
702
+
703
+
704
if __name__ == '__main__':
    # Demo: degrade a test image 20 times and save side-by-side comparisons
    # (bicubic LR | degraded LR | HQ), each upsampled with nearest neighbour.
    print("hey")
    img_uint = util.imread_uint('utils/test.png', 3)
    print(img_uint)
    img_uint = img_uint[:448, :448]
    # degradation_bsrgan_variant applies util.uint2single itself, so it is
    # fed the unconverted image; the converted copy is the HQ reference.
    # NOTE(review): assumes imread_uint returns a uint8 HxWxC array -- confirm.
    img_hq = util.uint2single(img_uint)
    print(img_hq)
    h = img_hq.shape[0] // 4
    print("resizing to", h)
    sf = 4
    deg_fn = partial(degradation_bsrgan_variant, sf=sf)
    for i in range(20):
        print(i)
        # the variant returns {"image": <output of util.single2uint>};
        # unwrap and convert back to the same representation as img_hq
        img_lq = util.uint2single(deg_fn(img_uint)["image"])
        print(img_lq)
        img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
        print(img_lq.shape)
        print("bicubic", img_lq_bicubic.shape)
        print(img_hq.shape)
        target_size = (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0]))
        lq_nearest = cv2.resize(util.single2uint(img_lq), target_size, interpolation=0)
        lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), target_size, interpolation=0)
        img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
        util.imsave(img_concat, str(i) + '.png')
729
+
730
+
ldm/modules/image_degradation/bsrgan_light.py ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+
6
+ from functools import partial
7
+ import random
8
+ from scipy import ndimage
9
+ import scipy
10
+ import scipy.stats as ss
11
+ from scipy.interpolate import interp2d
12
+ from scipy.linalg import orth
13
+ import albumentations
14
+
15
+ import ldm.modules.image_degradation.utils_image as util
16
+
17
+ """
18
+ # --------------------------------------------
19
+ # Super-Resolution
20
+ # --------------------------------------------
21
+ #
22
+ # Kai Zhang ([email protected])
23
+ # https://github.com/cszn
24
+ # From 2019/03--2021/08
25
+ # --------------------------------------------
26
+ """
27
+
28
+
29
def modcrop_np(img, sf):
    """Crop a copy of ``img`` so its first two dimensions are multiples of ``sf``.

    Args:
        img: numpy image, 2-D or with trailing channel axes
        sf: scale factor

    Returns:
        cropped image (taken from a copy; the input is not modified)
    """
    dim0, dim1 = img.shape[:2]
    return np.copy(img)[:dim0 - dim0 % sf, :dim1 - dim1 % sf, ...]
40
+
41
+
42
+ """
43
+ # --------------------------------------------
44
+ # anisotropic Gaussian kernels
45
+ # --------------------------------------------
46
+ """
47
+
48
+
49
def analytic_kernel(k):
    """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper).

    Args:
        k: square 2-D kernel (X2)

    Returns:
        the cropped X4 kernel, normalized to sum to 1
    """
    k_size = k.shape[0]
    # Size of the big kernel
    big_size = 3 * k_size - 2
    big_k = np.zeros((big_size, big_size))
    # Loop over the small kernel to fill the big one
    for r in range(k_size):
        for c in range(k_size):
            big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
    # Crop the edges of the big kernel to ignore very small values and
    # reduce the run time of SR. Explicit end indices are used because
    # ``big_k[crop:-crop]`` would be empty when crop == 0 (1x1 kernel).
    crop = k_size // 2
    cropped_big_k = big_k[crop:big_size - crop, crop:big_size - crop]
    # Normalize to 1
    return cropped_big_k / cropped_big_k.sum()
63
+
64
+
65
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
    """Generate an anisotropic Gaussian kernel.

    Args:
        ksize : e.g., 15, kernel size
        theta : [0, pi], rotation angle range
        l1    : [0.1,50], scaling of eigenvalues
        l2    : [0.1,l1], scaling of eigenvalues
        If l1 = l2, will get an isotropic Gaussian kernel.

    Returns:
        k     : kernel
    """
    rotation = np.array([[np.cos(theta), -np.sin(theta)],
                         [np.sin(theta), np.cos(theta)]])
    # First principal direction: the unit x-axis rotated by theta.
    axis = np.dot(rotation, np.array([1., 0.]))
    basis = np.array([[axis[0], axis[1]], [axis[1], -axis[0]]])
    scales = np.array([[l1, 0], [0, l2]])
    covariance = np.dot(np.dot(basis, scales), np.linalg.inv(basis))
    return gm_blur_kernel(mean=[0, 0], cov=covariance, size=ksize)
84
+
85
+
86
def gm_blur_kernel(mean, cov, size=15):
    """Sample a 2-D Gaussian density on a ``size`` x ``size`` grid.

    Args:
        mean: length-2 mean of the Gaussian, in (x, y) order
        cov: 2x2 covariance matrix
        size: kernel side length

    Returns:
        size x size kernel normalized to sum to 1; entry [y, x] is the
        density at (x - size / 2 + 0.5, y - size / 2 + 0.5)
    """
    center = size / 2.0 + 0.5
    coords = np.arange(size) - center + 1
    # xx[y, x] is the x offset, yy[y, x] the y offset.
    xx, yy = np.meshgrid(coords, coords)
    points = np.stack([xx, yy], axis=-1)
    # One vectorized pdf evaluation instead of size**2 separate calls;
    # reshape because scipy squeezes single-point results to a scalar.
    k = np.reshape(ss.multivariate_normal.pdf(points, mean=mean, cov=cov), (size, size))

    k = k / np.sum(k)
    return k
97
+
98
+
99
def shift_pixel(x, sf, upper_left=True):
    """Shift pixel for super-resolution with different scale factors.

    The array is resampled bilinearly at coordinates offset by
    ``(sf - 1) / 2`` pixels along both axes; sample positions are clipped
    to the array bounds.

    Args:
        x: HxWxC or HxW array (at least 2 samples along each of H and W)
        sf: scale factor
        upper_left: shift direction; True samples at +shift, False at -shift

    Returns:
        a new shifted array (the input is left untouched); arrays that are
        neither 2-D nor 3-D are returned unchanged
    """
    # ``scipy.interpolate.interp2d`` is no longer available in current SciPy;
    # a degree-1 RectBivariateSpline is the bilinear replacement for
    # regular grids. Imported locally because the module-level imports do
    # not include it.
    from scipy.interpolate import RectBivariateSpline

    h, w = x.shape[:2]
    shift = (sf - 1) * 0.5
    xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
    if upper_left:
        x1 = xv + shift
        y1 = yv + shift
    else:
        x1 = xv - shift
        y1 = yv - shift

    x1 = np.clip(x1, 0, w - 1)
    y1 = np.clip(y1, 0, h - 1)
    # Point-wise evaluation avoids any ordering requirement on the clipped
    # (possibly repeated) coordinates.
    yy, xx = np.meshgrid(y1, x1, indexing='ij')

    def _resample(plane):
        """Bilinearly sample one HxW plane at the shifted grid."""
        return RectBivariateSpline(yv, xv, plane, kx=1, ky=1).ev(yy, xx)

    if x.ndim == 2:
        return _resample(x)
    if x.ndim == 3:
        return np.stack([_resample(x[:, :, i]) for i in range(x.shape[-1])], axis=-1)

    return x
126
+
127
+
128
def blur(x, k):
    """Blur every image of a batch with its own kernel.

    Args:
        x: image, NxcxHxW
        k: kernel, Nx1xhxw (one kernel per batch item, shared by its channels)

    Returns:
        blurred image, NxcxHxW for odd kernel sizes
    """
    n, c = x.shape[:2]
    p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
    # F.pad takes (left, right, top, bottom): the width is padded by the
    # kernel's half-width p2 and the height by its half-height p1. The
    # previous (p1, p2, p1, p2) was only correct for square kernels.
    x = torch.nn.functional.pad(x, pad=(p2, p2, p1, p1), mode='replicate')
    k = k.repeat(1, c, 1, 1)
    k = k.view(-1, 1, k.shape[2], k.shape[3])
    # Fold batch and channels together so one grouped convolution applies
    # a separate kernel to each (image, channel) plane.
    x = x.view(1, -1, x.shape[2], x.shape[3])
    x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
    x = x.view(n, c, x.shape[2], x.shape[3])

    return x
143
+
144
+
145
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
    """Generate a random anisotropic Gaussian kernel.

    Modified version of https://github.com/assafshocher/BlindSR_dataset_generator
    (Kai Zhang). Suggested variances: min_var = 0.175 * sf, max_var = 2.5 * sf.

    Args:
        k_size: numpy array with the kernel size per axis
        scale_factor: numpy array with the scale factor per axis; it shifts
            the Gaussian centre so the kernel suits an aligned image
        min_var: lower bound for the two sampled variances
        max_var: upper bound for the two sampled variances
        noise_level: amplitude of the multiplicative uniform noise

    Returns:
        kernel normalized to sum to 1
    """
    # Random eigenvalues (lambdas) and angle (theta) for the covariance
    var_span = max_var - min_var
    lambda_1 = min_var + np.random.rand() * var_span
    lambda_2 = min_var + np.random.rand() * var_span
    theta = np.random.rand() * np.pi
    noise = -noise_level + np.random.rand(*k_size) * noise_level * 2

    # Covariance matrix from the eigenvalues and the rotation
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t],
                         [sin_t, cos_t]])
    sigma = rotation @ np.diag([lambda_1, lambda_2]) @ rotation.T
    inv_sigma = np.linalg.inv(sigma)[None, None, :, :]

    # Expectation position (shifting the kernel for an aligned image)
    mu = (k_size // 2 - 0.5 * (scale_factor - 1))[None, None, :, None]

    # Offset of every kernel pixel from the expectation
    grid_x, grid_y = np.meshgrid(range(k_size[0]), range(k_size[1]))
    offsets = np.stack([grid_x, grid_y], 2)[:, :, :, None] - mu
    offsets_t = offsets.transpose(0, 1, 3, 2)

    # Gaussian value per pixel, modulated by the noise
    raw_kernel = np.exp(-0.5 * np.squeeze(offsets_t @ inv_sigma @ offsets)) * (1 + noise)

    # Normalize the kernel and return
    return raw_kernel / np.sum(raw_kernel)
185
+
186
+
187
def fspecial_gaussian(hsize, sigma):
    """Build a square isotropic Gaussian kernel (MATLAB ``fspecial`` style).

    Args:
        hsize: kernel side length
        sigma: standard deviation of the Gaussian

    Returns:
        hsize x hsize kernel normalized to sum to 1 (left unnormalized
        only if all entries are zero)
    """
    hsize = [hsize, hsize]
    siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
    std = sigma
    [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
    arg = -(x * x + y * y) / (2 * std * std)
    h = np.exp(arg)
    # Zero out entries that are negligible relative to the peak.
    # ``scipy.finfo`` was a NumPy alias that current SciPy no longer
    # provides; use ``np.finfo`` directly.
    h[h < np.finfo(float).eps * h.max()] = 0
    sumh = h.sum()
    if sumh != 0:
        h = h / sumh
    return h
199
+
200
+
201
def fspecial_laplacian(alpha):
    """Build the 3x3 Laplacian kernel (MATLAB ``fspecial`` style).

    Args:
        alpha: shape parameter, clamped to [0, 1]

    Returns:
        3x3 numpy array
    """
    alpha = max([0, min([alpha, 1])])
    corner = alpha / (alpha + 1)
    edge = (1 - alpha) / (alpha + 1)
    center = -4 / (alpha + 1)
    return np.array([
        [corner, edge, corner],
        [edge, center, edge],
        [corner, edge, corner],
    ])
208
+
209
+
210
def fspecial(filter_type, *args, **kwargs):
    """Dispatch to a MATLAB-style ``fspecial`` kernel builder.

    python code from:
    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py

    Args:
        filter_type: 'gaussian' or 'laplacian'
        *args, **kwargs: forwarded to the selected builder

    Returns:
        the kernel, or None for any other ``filter_type``
    """
    if filter_type == 'gaussian':
        kernel = fspecial_gaussian(*args, **kwargs)
    elif filter_type == 'laplacian':
        kernel = fspecial_laplacian(*args, **kwargs)
    else:
        kernel = None
    return kernel
219
+
220
+
221
+ """
222
+ # --------------------------------------------
223
+ # degradation models
224
+ # --------------------------------------------
225
+ """
226
+
227
+
228
def bicubic_degradation(x, sf=3):
    """Downsample ``x`` by a factor of ``sf`` via ``util.imresize_np``.

    Args:
        x: HxWxC image, [0, 1]
        sf: down-scale factor

    Returns:
        bicubicly downsampled LR image
    """
    return util.imresize_np(x, scale=1 / sf)
238
+
239
+
240
def srmd_degradation(x, k, sf=3):
    """Blur followed by bicubic downsampling (SRMD degradation).

    Args:
        x: HxWxC image, [0, 1]
        k: hxw blur kernel, double
        sf: down-scale factor

    Returns:
        downsampled LR image

    Reference:
        @inproceedings{zhang2018learning,
          title={Learning a single convolutional super-resolution network for multiple degradations},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={3262--3271},
          year={2018}
        }
    """
    # Trailing singleton axis: the same 2-D kernel filters every channel.
    kernel = np.expand_dims(k, axis=2)
    blurred = ndimage.convolve(x, kernel, mode='wrap')  # 'nearest' | 'mirror'
    return bicubic_degradation(blurred, sf=sf)
260
+
261
+
262
def dpsr_degradation(x, k, sf=3):
    """Bicubic downsampling followed by blur (DPSR degradation).

    Args:
        x: HxWxC image, [0, 1]
        k: hxw blur kernel, double
        sf: down-scale factor

    Returns:
        downsampled and blurred LR image

    Reference:
        @inproceedings{zhang2019deep,
          title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
          author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
          booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
          pages={1671--1681},
          year={2019}
        }
    """
    downsampled = bicubic_degradation(x, sf=sf)
    # Trailing singleton axis: the same 2-D kernel filters every channel.
    kernel = np.expand_dims(k, axis=2)
    return ndimage.convolve(downsampled, kernel, mode='wrap')
282
+
283
+
284
def classical_degradation(x, k, sf=3):
    """Blur followed by direct (strided) downsampling.

    Args:
        x: HxWxC image, [0, 1]/[0, 255]
        k: hxw blur kernel, double
        sf: down-scale factor

    Returns:
        downsampled LR image (every ``sf``-th pixel of the blurred image,
        starting at the top-left pixel)
    """
    kernel = np.expand_dims(k, axis=2)
    blurred = ndimage.convolve(x, kernel, mode='wrap')
    offset = 0  # sampling offset inside each sf x sf cell
    return blurred[offset::sf, offset::sf, ...]
297
+
298
+
299
def add_sharpening(img, weight=0.5, radius=50, threshold=10):
    """Unsharp-mask (USM) sharpening, borrowed from Real-ESRGAN.

    With input image I and its Gaussian-blurred version B:
        1. K = I + weight * (I - B)
        2. Mask = 1 where abs(I - B) * 255 > threshold, else 0
        3. The mask is softened with the same Gaussian blur
        4. Out = Mask * K + (1 - Mask) * I

    Args:
        img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
        weight (float): Sharpening weight. Default: 0.5.
        radius (float): Kernel size of the Gaussian blur; bumped to the
            next odd number when even. Default: 50.
        threshold (int): Residual threshold on the 0-255 scale. Default: 10.

    Returns:
        The sharpened image.
    """
    ksize = radius + 1 if radius % 2 == 0 else radius
    blurred = cv2.GaussianBlur(img, (ksize, ksize), 0)
    detail = img - blurred
    hard_mask = (np.abs(detail) * 255 > threshold).astype('float32')
    soft_mask = cv2.GaussianBlur(hard_mask, (ksize, ksize), 0)

    sharpened = np.clip(img + weight * detail, 0, 1)
    return soft_mask * sharpened + (1 - soft_mask) * img
323
+
324
+
325
def add_blur(img, sf=4):
    """Blur ``img`` with a randomly drawn (light) Gaussian kernel.

    With probability 0.5 an anisotropic Gaussian is used, otherwise an
    isotropic one. The kernel widths are a quarter of ``4 + sf`` and
    ``2 + 0.2 * sf`` respectively, scaled by a uniform random factor.

    Args:
        img: HxWxC image
        sf: scale factor controlling the maximum kernel width

    Returns:
        blurred image
    """
    aniso_width = (4.0 + sf) / 4
    iso_width = (2.0 + 0.2 * sf) / 4

    if random.random() < 0.5:
        l1 = aniso_width * random.random()
        l2 = aniso_width * random.random()
        ksize = random.randint(2, 11) + 3
        theta = random.random() * np.pi
        kernel = anisotropic_Gaussian(ksize=ksize, theta=theta, l1=l1, l2=l2)
    else:
        ksize = random.randint(2, 4) + 3
        sigma = iso_width * random.random()
        kernel = fspecial('gaussian', ksize, sigma)

    # Trailing singleton axis: the same 2-D kernel filters every channel.
    return ndimage.convolve(img, np.expand_dims(kernel, axis=2), mode='mirror')
341
+
342
+
343
def add_resize(img, sf=4):
    """Randomly rescale ``img`` and clip the result to [0, 1].

    A uniform draw picks the mode: above 0.8 upscale by a factor in [1, 2],
    below 0.7 downscale by a factor in [0.5 / sf, 1], otherwise keep the size.
    The OpenCV interpolation flag is chosen at random from 1, 2 and 3.

    Args:
        img: HxWxC image
        sf: scale factor bounding the strongest downscale

    Returns:
        resized image clipped to [0, 1]
    """
    draw = np.random.rand()
    if draw > 0.8:  # up
        scale = random.uniform(1, 2)
    elif draw < 0.7:  # down
        scale = random.uniform(0.5 / sf, 1)
    else:
        scale = 1.0
    new_w = int(scale * img.shape[1])
    new_h = int(scale * img.shape[0])
    resized = cv2.resize(img, (new_w, new_h), interpolation=random.choice([1, 2, 3]))
    return np.clip(resized, 0.0, 1.0)
355
+
356
+
357
+ # def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
358
+ # noise_level = random.randint(noise_level1, noise_level2)
359
+ # rnum = np.random.rand()
360
+ # if rnum > 0.6: # add color Gaussian noise
361
+ # img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
362
+ # elif rnum < 0.4: # add grayscale Gaussian noise
363
+ # img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
364
+ # else: # add noise
365
+ # L = noise_level2 / 255.
366
+ # D = np.diag(np.random.rand(3))
367
+ # U = orth(np.random.rand(3, 3))
368
+ # conv = np.dot(np.dot(np.transpose(U), D), U)
369
+ # img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
370
+ # img = np.clip(img, 0.0, 1.0)
371
+ # return img
372
+
373
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
    """Add Gaussian noise of a randomly chosen flavour and clip to [0, 1].

    One of three noise types is picked by a uniform draw: per-channel
    (colour) noise, one noise plane shared by all channels (grayscale), or
    channel-correlated noise drawn from a random 3x3 covariance.

    Args:
        img: HxWxC image, [0, 1]; the correlated branch draws 3-component
            noise vectors
        noise_level1: lower bound of the noise level (0-255 scale)
        noise_level2: upper bound of the noise level (0-255 scale)

    Returns:
        a new noisy image clipped to [0, 1]; the input is not modified
    """
    noise_level = random.randint(noise_level1, noise_level2)
    draw = np.random.rand()
    if draw > 0.6:  # colour Gaussian noise
        noise = np.random.normal(0, noise_level / 255.0, img.shape)
    elif draw < 0.4:  # grayscale Gaussian noise, broadcast over channels
        noise = np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1))
    else:  # channel-correlated noise
        L = noise_level2 / 255.
        eigvals = np.diag(np.random.rand(3))
        basis = orth(np.random.rand(3, 3))
        cov = np.dot(np.dot(np.transpose(basis), eigvals), basis)
        noise = np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * cov), img.shape[:2])
    return np.clip(img + noise.astype(np.float32), 0.0, 1.0)
388
+
389
+
390
def add_speckle_noise(img, noise_level1=2, noise_level2=25):
    """Add multiplicative (speckle) Gaussian noise and clip to [0, 1].

    The noise is scaled by the image itself. A uniform draw picks between
    per-channel noise, one noise plane shared by all channels, and
    channel-correlated noise drawn from a random 3x3 covariance.

    Args:
        img: HxWxC image; clipped to [0, 1] before noise is applied
        noise_level1: lower bound of the noise level (0-255 scale)
        noise_level2: upper bound of the noise level (0-255 scale)

    Returns:
        a new noisy image clipped to [0, 1]
    """
    noise_level = random.randint(noise_level1, noise_level2)
    # np.clip returns a fresh array, so the in-place update below never
    # touches the caller's data.
    base = np.clip(img, 0.0, 1.0)
    draw = random.random()
    if draw > 0.6:
        noise = np.random.normal(0, noise_level / 255.0, base.shape)
    elif draw < 0.4:
        noise = np.random.normal(0, noise_level / 255.0, (*base.shape[:2], 1))
    else:
        L = noise_level2 / 255.
        eigvals = np.diag(np.random.rand(3))
        basis = orth(np.random.rand(3, 3))
        cov = np.dot(np.dot(np.transpose(basis), eigvals), basis)
        noise = np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * cov), base.shape[:2])
    base += base * noise.astype(np.float32)
    return np.clip(base, 0.0, 1.0)
406
+
407
+
408
def add_Poisson_noise(img):
    """Add Poisson (shot) noise and clip to [0, 1].

    The image is first quantised to 256 levels. With probability 0.5 the
    Poisson noise is sampled per channel; otherwise it is sampled on a
    luma plane and the same noise is added to every channel.

    Args:
        img: HxWxC image, [0, 1]; the luma branch reads the first 3 channels

    Returns:
        a new noisy image clipped to [0, 1]
    """
    quantized = np.clip((img * 255.0).round(), 0, 255) / 255.
    peak = 10 ** (2 * random.random() + 2.0)  # exponent drawn from [2, 4]
    if random.random() < 0.5:
        noisy = np.random.poisson(quantized * peak).astype(np.float32) / peak
    else:
        luma = np.dot(quantized[..., :3], [0.299, 0.587, 0.114])
        luma = np.clip((luma * 255.0).round(), 0, 255) / 255.
        luma_noise = np.random.poisson(luma * peak).astype(np.float32) / peak - luma
        noisy = quantized
        noisy += luma_noise[:, :, np.newaxis]
    return np.clip(noisy, 0.0, 1.0)
420
+
421
+
422
def add_JPEG_noise(img):
    """Round-trip ``img`` through in-memory JPEG compression.

    The quality factor is drawn uniformly from [80, 95]. The image is
    converted to uint8 BGR for OpenCV, encoded, decoded, and converted
    back via ``util.uint2single`` to RGB.

    Args:
        img: RGB image accepted by ``util.single2uint``

    Returns:
        the JPEG-degraded RGB image
    """
    quality = random.randint(80, 95)
    bgr = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
    _, encoded = cv2.imencode('.jpg', bgr, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
    decoded = cv2.imdecode(encoded, 1)
    return cv2.cvtColor(util.uint2single(decoded), cv2.COLOR_BGR2RGB)
429
+
430
+
431
def random_crop(lq, hq, sf=4, lq_patchsize=64):
    """Cut spatially aligned random patches from an LQ/HQ image pair.

    Args:
        lq: low-quality image, HxWxC
        hq: high-quality image, expected to be ``sf`` times larger than ``lq``
        sf: scale factor between ``lq`` and ``hq``
        lq_patchsize: side length of the LQ patch

    Returns:
        (lq_patch, hq_patch); the HQ patch has side ``lq_patchsize * sf`` and
        starts at the LQ offset multiplied by ``sf``
    """
    lq_h, lq_w = lq.shape[:2]
    top = random.randint(0, lq_h - lq_patchsize)
    left = random.randint(0, lq_w - lq_patchsize)
    lq_patch = lq[top:top + lq_patchsize, left:left + lq_patchsize, :]

    hq_top, hq_left = int(top * sf), int(left * sf)
    hq_size = lq_patchsize * sf
    hq_patch = hq[hq_top:hq_top + hq_size, hq_left:hq_left + hq_size, :]
    return lq_patch, hq_patch
440
+
441
+
442
def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    """
    This is the degradation model of BSRGAN from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    ----------
    img: HxWxC, [0, 1], its size should be larger than (lq_patchsize x sf) x (lq_patchsize x sf)
    sf: scale factor
    lq_patchsize: side length of the returned low-quality patch
    isp_model: camera ISP model (optional; the ISP step is skipped when None)
    Returns
    -------
    img: low-quality patch, size: lq_patchsize x lq_patchsize x C, range: [0, 1]
    hq: corresponding high-quality patch, size: (lq_patchsize x sf) x (lq_patchsize x sf) x C, range: [0, 1]
    Raises
    ------
    ValueError: if the mod-cropped image is smaller than lq_patchsize * sf along either axis
    """
    isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
    sf_ori = sf

    h1, w1 = img.shape[:2]
    # mod crop: axis 0 is the height and axis 1 the width, so each axis is
    # trimmed to a multiple of sf using its own length
    img = img.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
        raise ValueError(f'img size ({h1}X{w1}) is too small!')

    hq = img.copy()

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
                             interpolation=random.choice([1, 2, 3]))
        else:
            img = util.imresize_np(img, 1 / 2, True)
        img = np.clip(img, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 last
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    for i in shuffle_order:

        if i == 0:
            img = add_blur(img, sf=sf)

        elif i == 1:
            img = add_blur(img, sf=sf)

        elif i == 2:
            # remember the size before downsample2; downsample3 resizes relative to it
            a, b = img.shape[1], img.shape[0]
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
                                 interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                img = ndimage.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
                img = img[0::sf, 0::sf, ...]  # nearest downsampling
            img = np.clip(img, 0.0, 1.0)

        elif i == 3:
            # downsample3 (always runs after step 2, so a and b are defined)
            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            img = np.clip(img, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                img = add_JPEG_noise(img)

        elif i == 6:
            # add processed camera sensor noise
            if random.random() < isp_prob and isp_model is not None:
                with torch.no_grad():
                    img, hq = isp_model.forward(img.copy(), hq)

    # add final JPEG compression noise
    img = add_JPEG_noise(img)

    # random crop
    img, hq = random_crop(img, hq, sf_ori, lq_patchsize)

    return img, hq
531
+
532
+
533
+ # todo no isp_model?
534
def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    """
    This is a variant of the degradation model of BSRGAN from the paper
    "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
    ----------
    image: HxWxC image; it is converted with util.uint2single on entry
    sf: scale factor
    isp_model: camera ISP model (accepted for signature compatibility; unused here)
    Returns
    -------
    example: dict with the single key "image", holding the degraded image
             converted back with util.single2uint
    """
    image = util.uint2single(image)
    jpeg_prob, scale2_prob = 0.9, 0.25

    h1, w1 = image.shape[:2]
    # mod crop: axis 0 is the height and axis 1 the width, so each axis is
    # trimmed to a multiple of sf using its own length
    image = image.copy()[:h1 - h1 % sf, :w1 - w1 % sf, ...]

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
                               interpolation=random.choice([1, 2, 3]))
        else:
            image = util.imresize_np(image, 1 / 2, True)
        image = np.clip(image, 0.0, 1.0)
        sf = 2

    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 last
        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]

    # Steps 1 (second blur) and 6 (camera ISP noise) are disabled in this
    # variant; those indices stay in the shuffle but do nothing.
    for i in shuffle_order:

        if i == 0:
            image = add_blur(image, sf=sf)

        elif i == 2:
            # remember the size before downsample2; downsample3 resizes relative to it
            a, b = image.shape[1], image.shape[0]
            # downsample2
            if random.random() < 0.8:
                sf1 = random.uniform(1, 2 * sf)
                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
                                   interpolation=random.choice([1, 2, 3]))
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
                image = ndimage.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
                image = image[0::sf, 0::sf, ...]  # nearest downsampling

            image = np.clip(image, 0.0, 1.0)

        elif i == 3:
            # downsample3 (always runs after step 2, so a and b are defined)
            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
            image = np.clip(image, 0.0, 1.0)

        elif i == 4:
            # add Gaussian noise
            image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2)

        elif i == 5:
            # add JPEG noise
            if random.random() < jpeg_prob:
                image = add_JPEG_noise(image)

    # add final JPEG compression noise
    image = add_JPEG_noise(image)
    image = util.single2uint(image)
    example = {"image": image}
    return example
622
+
623
+
624
+
625
+
626
if __name__ == '__main__':
    # Demo: degrade the test image 20 times and save, for each run, a
    # side-by-side of (bicubic-downsampled, degraded, original).
    print("hey")
    img = util.imread_uint('utils/test.png', 3)
    img = img[:448, :448]
    h = img.shape[0] // 4
    print("resizing to", h)
    sf = 4
    deg_fn = partial(degradation_bsrgan_variant, sf=sf)
    for i in range(20):
        print(i)
        img_lq = util.uint2single(deg_fn(img)["image"])
        img_hq = util.uint2single(img)
        print(img_lq)
        shrink = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)
        img_lq_bicubic = shrink(image=img_hq)["image"]
        print(img_lq.shape)
        print("bicubic", img_lq_bicubic.shape)
        print(img_hq.shape)
        # Both low-resolution images are upsampled to sf times the degraded
        # image's size so the three panels can be concatenated.
        upsampled_size = (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0]))
        lq_nearest = cv2.resize(util.single2uint(img_lq), upsampled_size, interpolation=0)
        lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), upsampled_size, interpolation=0)
        img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
        util.imsave(img_concat, str(i) + '.png')
ldm/modules/image_degradation/utils/test.png ADDED
ldm/modules/image_degradation/utils_image.py ADDED
@@ -0,0 +1,916 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import math
3
+ import random
4
+ import numpy as np
5
+ import torch
6
+ import cv2
7
+ from torchvision.utils import make_grid
8
+ from datetime import datetime
9
+ #import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
10
+
11
+
12
+ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
13
+
14
+
15
+ '''
16
+ # --------------------------------------------
17
+ # Kai Zhang (github: https://github.com/cszn)
18
+ # 03/Mar/2019
19
+ # --------------------------------------------
20
+ # https://github.com/twhui/SRGAN-pyTorch
21
+ # https://github.com/xinntao/BasicSR
22
+ # --------------------------------------------
23
+ '''
24
+
25
+
26
IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif']


def is_image_file(filename):
    """Return True when ``filename`` ends with one of IMG_EXTENSIONS (case-sensitive)."""
    return filename.endswith(tuple(IMG_EXTENSIONS))
31
+
32
+
33
def get_timestamp():
    """Return the current local time formatted as ``yymmdd-HHMMSS``."""
    now = datetime.now()
    return now.strftime('%y%m%d-%H%M%S')
35
+
36
+
37
def imshow(x, title=None, cbar=False, figsize=None):
    """Display ``x`` (squeezed) as a grayscale image with nearest-neighbour interpolation.

    Args:
        x: array to display.
        title: optional figure title.
        cbar: when true, a colorbar is added.
        figsize: forwarded to ``plt.figure``.

    NOTE(review): ``plt`` is not imported in this module (the matplotlib import
    at the top of the file is commented out), so this call needs that import
    restored first.
    """
    plt.figure(figsize=figsize)
    squeezed = np.squeeze(x)
    plt.imshow(squeezed, interpolation='nearest', cmap='gray')
    if title:
        plt.title(title)
    if cbar:
        plt.colorbar()
    plt.show()
45
+
46
+
47
def surf(Z, cmap='rainbow', figsize=None):
    """Draw ``Z`` as a 3D surface plot.

    Args:
        Z: 2D array of surface heights.
        cmap: colormap name passed to ``plot_surface``.
        figsize: forwarded to ``plt.figure``.

    NOTE(review): ``plt`` is not imported in this module (the matplotlib import
    at the top of the file is commented out), so this call needs that import
    restored first.
    """
    plt.figure(figsize=figsize)
    axes3d = plt.axes(projection='3d')

    n0, n1 = Z.shape[:2]
    X, Y = np.meshgrid(np.arange(0, n0, 1), np.arange(0, n1, 1))
    axes3d.plot_surface(X, Y, Z, cmap=cmap)
    plt.show()
58
+
59
+
60
+ '''
61
+ # --------------------------------------------
62
+ # get image paths
63
+ # --------------------------------------------
64
+ '''
65
+
66
+
67
def get_image_paths(dataroot):
    """Return the sorted image paths found under ``dataroot``, or None when it is None."""
    if dataroot is None:
        return None
    return sorted(_get_paths_from_images(dataroot))
72
+
73
+
74
+ def _get_paths_from_images(path):
75
+ assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
76
+ images = []
77
+ for dirpath, _, fnames in sorted(os.walk(path)):
78
+ for fname in sorted(fnames):
79
+ if is_image_file(fname):
80
+ img_path = os.path.join(dirpath, fname)
81
+ images.append(img_path)
82
+ assert images, '{:s} has no valid image file'.format(path)
83
+ return images
84
+
85
+
86
+ '''
87
+ # --------------------------------------------
88
+ # split large images into small images
89
+ # --------------------------------------------
90
+ '''
91
+
92
+
93
def patches_from_image(img, p_size=512, p_overlap=64, p_max=800):
    """Split a large image into overlapping ``p_size`` x ``p_size`` patches.

    Args:
        img: 3D numpy image; the first two axes are tiled.
        p_size: side length of every patch.
        p_overlap: overlap between neighbouring patches; the stride along each
            axis is ``p_size - p_overlap``.
        p_max: the image is split only when both of its first two dimensions
            exceed this value.

    Returns:
        A list of patches. When the image is not larger than ``p_max`` along
        both axes, the list contains just ``img`` itself.
    """
    dim0, dim1 = img.shape[:2]
    if not (dim0 > p_max and dim1 > p_max):
        return [img]

    stride = p_size - p_overlap
    # np.int was removed from NumPy; the builtin int is the same dtype.
    starts0 = list(np.arange(0, dim0 - p_size, stride, dtype=int))
    starts1 = list(np.arange(0, dim1 - p_size, stride, dtype=int))
    # A final start position makes the last patch end at the image border.
    starts0.append(dim0 - p_size)
    starts1.append(dim1 - p_size)
    return [img[i:i + p_size, j:j + p_size, :] for i in starts0 for j in starts1]
110
+
111
+
112
def imssave(imgs, img_path):
    """
    Save a list of images next to ``img_path`` as ``<name>_s0000.png``, ``<name>_s0001.png``, ...

    imgs: list of N images; the first three channels of 3D images are reversed before writing
    img_path: path whose directory and base name (without extension) are reused
    """
    stem, _ = os.path.splitext(os.path.basename(img_path))
    out_dir = os.path.dirname(img_path)

    for index, patch in enumerate(imgs):
        if patch.ndim == 3:
            patch = patch[:, :, [2, 1, 0]]
        filename = stem + '_s{:04d}'.format(index) + '.png'
        cv2.imwrite(os.path.join(out_dir, filename), patch)
123
+
124
+
125
def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
    """
    Split every image under original_dataroot into overlapping patches and
    write them into taget_dataroot.

    Only images larger than (p_max)x(p_max) are split; smaller images are
    written back as a single patch.

    Args:
        original_dataroot: directory that is searched for images
        taget_dataroot: directory that receives the patches
        n_channels: forwarded to imread_uint
        p_size: size of the small images
        p_overlap: overlap between neighbouring patches
        p_max: images not larger than (p_max)x(p_max) stay unsplit
    """
    for source_path in get_image_paths(original_dataroot):
        image = imread_uint(source_path, n_channels=n_channels)
        pieces = patches_from_image(image, p_size, p_overlap, p_max)
        destination = os.path.join(taget_dataroot, os.path.basename(source_path))
        imssave(pieces, destination)
145
+
146
+ '''
147
+ # --------------------------------------------
148
+ # makedir
149
+ # --------------------------------------------
150
+ '''
151
+
152
+
153
def mkdir(path):
    """Create ``path`` (including missing parents) unless something already exists there."""
    if os.path.exists(path):
        return
    os.makedirs(path)
156
+
157
+
158
def mkdirs(paths):
    """Call ``mkdir`` on a single path string or on every element of an iterable of paths."""
    targets = [paths] if isinstance(paths, str) else paths
    for target in targets:
        mkdir(target)
164
+
165
+
166
def mkdir_and_rename(path):
    """Create a fresh directory at ``path``, moving any existing entry out of the way.

    An existing ``path`` is renamed to ``<path>_archived_<timestamp>`` first,
    and the new name is printed.
    """
    already_there = os.path.exists(path)
    if already_there:
        archived = path + '_archived_' + get_timestamp()
        print('Path already exists. Rename it to [{:s}]'.format(archived))
        os.rename(path, archived)
    os.makedirs(path)
172
+
173
+
174
+ '''
175
+ # --------------------------------------------
176
+ # read image from path
177
+ # opencv is fast, but read BGR numpy image
178
+ # --------------------------------------------
179
+ '''
180
+
181
+
182
+ # --------------------------------------------
183
+ # get uint8 image of size HxWxn_channles (RGB)
184
+ # --------------------------------------------
185
def imread_uint(path, n_channels=3):
    """Read an image file with OpenCV.

    Args:
        path: path of the image file.
        n_channels: 1 reads the file as grayscale and returns HxWx1;
            3 returns HxWx3 in RGB order (a 2D image is expanded to three
            identical channels).

    Returns:
        The image as a numpy array.

    Raises:
        ValueError: if ``n_channels`` is neither 1 nor 3.
        IOError: if OpenCV could not read the file (``cv2.imread`` returned None).
    """
    if n_channels not in (1, 3):
        raise ValueError('n_channels must be 1 or 3, got {}'.format(n_channels))

    if n_channels == 1:
        img = cv2.imread(path, 0)  # cv2.IMREAD_GRAYSCALE
    else:
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # BGR or G
    if img is None:
        raise IOError('could not read image: {}'.format(path))

    if n_channels == 1:
        return np.expand_dims(img, axis=2)  # HxWx1
    if img.ndim == 2:
        return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)  # GGG
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB
198
+
199
+
200
+ # --------------------------------------------
201
+ # matlab's imwrite
202
+ # --------------------------------------------
203
def imsave(img, img_path):
    """Write ``img`` to ``img_path`` with OpenCV.

    Singleton dimensions are squeezed away first; for a 3D result the first
    three channels are reversed (RGB <-> BGR) before writing.
    """
    squeezed = np.squeeze(img)
    to_write = squeezed[:, :, [2, 1, 0]] if squeezed.ndim == 3 else squeezed
    cv2.imwrite(img_path, to_write)
208
+
209
def imwrite(img, img_path):
    """Write ``img`` to ``img_path`` with OpenCV (same behaviour as ``imsave``).

    Singleton dimensions are squeezed away first; for a 3D result the first
    three channels are reversed (RGB <-> BGR) before writing.
    """
    squeezed = np.squeeze(img)
    to_write = squeezed[:, :, [2, 1, 0]] if squeezed.ndim == 3 else squeezed
    cv2.imwrite(img_path, to_write)
214
+
215
+
216
+
217
+ # --------------------------------------------
218
+ # get single image of size HxWxn_channles (BGR)
219
+ # --------------------------------------------
220
def read_img(path):
    """Read an image with OpenCV and return it as float32 HWC, divided by 255.

    The file is read with ``cv2.IMREAD_UNCHANGED``. A 2D image gets a trailing
    channel axis, and images with more than three channels keep only the
    first three.
    """
    raw = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    img = raw.astype(np.float32) / 255.
    if img.ndim == 2:
        img = np.expand_dims(img, axis=2)
    has_extra_channels = img.shape[2] > 3  # e.g. an alpha channel
    if has_extra_channels:
        img = img[:, :, :3]
    return img
231
+
232
+
233
+ '''
234
+ # --------------------------------------------
235
+ # image format conversion
236
+ # --------------------------------------------
237
+ # numpy(single) <---> numpy(unit)
238
+ # numpy(single) <---> tensor
239
+ # numpy(unit) <---> tensor
240
+ # --------------------------------------------
241
+ '''
242
+
243
+
244
+ # --------------------------------------------
245
+ # numpy(single) [0, 1] <---> numpy(unit)
246
+ # --------------------------------------------
247
+
248
+
249
def uint2single(img):
    """Divide ``img`` by 255 and return the result as float32."""
    scaled = img / 255.
    return np.float32(scaled)
252
+
253
+
254
def single2uint(img):
    """Clip ``img`` to [0, 1], scale by 255, round and return it as uint8."""
    clipped = img.clip(0, 1)
    rounded = (clipped * 255.).round()
    return np.uint8(rounded)
257
+
258
+
259
def uint162single(img):
    """Divide ``img`` by 65535 and return the result as float32."""
    scaled = img / 65535.
    return np.float32(scaled)
262
+
263
+
264
def single2uint16(img):
    """Clip ``img`` to [0, 1], scale by 65535, round and return it as uint16."""
    clipped = img.clip(0, 1)
    rounded = (clipped * 65535.).round()
    return np.uint16(rounded)
267
+
268
+
269
+ # --------------------------------------------
270
+ # numpy(unit) (HxWxC or HxW) <---> tensor
271
+ # --------------------------------------------
272
+
273
+
274
+ # convert uint to 4-dimensional torch tensor
275
def uint2tensor4(img):
    """Convert an HxW or HxWxC numpy image to a 1xCxHxW float tensor divided by 255."""
    if img.ndim == 2:
        img = img[:, :, np.newaxis]
    hwc = torch.from_numpy(np.ascontiguousarray(img))
    chw = hwc.permute(2, 0, 1).float().div(255.)
    return chw.unsqueeze(0)
279
+
280
+
281
+ # convert uint to 3-dimensional torch tensor
282
def uint2tensor3(img):
    """Convert an HxW or HxWxC numpy image to a CxHxW float tensor divided by 255."""
    if img.ndim == 2:
        img = img[:, :, np.newaxis]
    hwc = torch.from_numpy(np.ascontiguousarray(img))
    return hwc.permute(2, 0, 1).float().div(255.)
286
+
287
+
288
+ # convert 2/3/4-dimensional torch tensor to uint
289
def tensor2uint(img):
    """Convert a 2/3/4-dimensional torch tensor to a uint8 numpy image.

    The tensor is squeezed, clamped to [0, 1], scaled by 255 and rounded.
    A 3D result is transposed from CxHxW to HxWxC.

    The clamp is done out of place: ``squeeze()`` returns a view and
    ``float()`` returns the same tensor for float32 input, so an in-place
    ``clamp_`` would overwrite the caller's tensor.
    """
    img = img.data.squeeze().float().clamp(0, 1).cpu().numpy()
    if img.ndim == 3:
        img = np.transpose(img, (1, 2, 0))
    return np.uint8((img*255.0).round())
294
+
295
+
296
+ # --------------------------------------------
297
+ # numpy(single) (HxWxC) <---> tensor
298
+ # --------------------------------------------
299
+
300
+
301
+ # convert single (HxWxC) to 3-dimensional torch tensor
302
def single2tensor3(img):
    """Convert an HxWxC numpy image to a CxHxW float tensor (values unchanged)."""
    hwc = torch.from_numpy(np.ascontiguousarray(img))
    return hwc.permute(2, 0, 1).float()
304
+
305
+
306
+ # convert single (HxWxC) to 4-dimensional torch tensor
307
def single2tensor4(img):
    """Convert an HxWxC numpy image to a 1xCxHxW float tensor (values unchanged)."""
    hwc = torch.from_numpy(np.ascontiguousarray(img))
    chw = hwc.permute(2, 0, 1).float()
    return chw.unsqueeze(0)
309
+
310
+
311
+ # convert torch tensor to single
312
def tensor2single(img):
    """Convert a torch tensor to a float numpy array.

    The tensor is squeezed first; a 3D result is transposed from CxHxW to
    HxWxC, any other rank is returned as is.
    """
    arr = img.data.squeeze().float().cpu().numpy()
    if arr.ndim != 3:
        return arr
    return np.transpose(arr, (1, 2, 0))
318
+
319
+ # convert torch tensor to single
320
def tensor2single3(img):
    """Convert a torch tensor to a float numpy array with a channel axis.

    The tensor is squeezed first. A 3D result is transposed from CxHxW to
    HxWxC, a 2D result gets a trailing axis (HxWx1), and any other rank is
    returned unchanged.
    """
    arr = img.data.squeeze().float().cpu().numpy()
    if arr.ndim == 3:
        return np.transpose(arr, (1, 2, 0))
    if arr.ndim == 2:
        return np.expand_dims(arr, axis=2)
    return arr
327
+
328
+
329
def single2tensor5(img):
    """Convert a 4D numpy array to a 5D float tensor.

    Axes are reordered as (2, 0, 1, 3) and a leading axis of size 1 is added.
    """
    source = torch.from_numpy(np.ascontiguousarray(img))
    reordered = source.permute(2, 0, 1, 3).float()
    return reordered.unsqueeze(0)
331
+
332
+
333
def single32tensor5(img):
    """Convert a numpy array to a float tensor with two leading axes of size 1 added."""
    source = torch.from_numpy(np.ascontiguousarray(img)).float()
    return source.unsqueeze(0).unsqueeze(0)
335
+
336
+
337
def single42tensor4(img):
    """Convert a 4D numpy array to a float tensor with axes reordered as (2, 0, 1, 3)."""
    source = torch.from_numpy(np.ascontiguousarray(img))
    return source.permute(2, 0, 1, 3).float()
339
+
340
+
341
+ # from skimage.io import imread, imsave
342
def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
    '''
    Converts a torch Tensor into an image Numpy array of BGR channel order
    Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
    Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)

    tensor: input tensor; it is squeezed before its rank is inspected
    out_type: numpy dtype of the result; values are scaled to [0, 255] and
        rounded only when it is np.uint8
    min_max: (min, max) clamp range that is mapped to [0, 1]
    Raises TypeError when the squeezed tensor is not 2D, 3D or 4D.
    '''
    # squeeze first, then clamp. The clamp is out of place: for a CPU float32
    # tensor squeeze()/float()/cpu() hand back the caller's own storage, and an
    # in-place clamp_ would overwrite it.
    tensor = tensor.squeeze().float().cpu().clamp(*min_max)
    tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0])  # to range [0,1]
    n_dim = tensor.dim()
    if n_dim == 4:
        n_img = len(tensor)
        img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy()
        img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
    elif n_dim == 3:
        img_np = tensor.numpy()
        img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
    elif n_dim == 2:
        img_np = tensor.numpy()
    else:
        raise TypeError(
            'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
    if out_type == np.uint8:
        img_np = (img_np * 255.0).round()
        # Important. Unlike matlab, numpy.uint8() WILL NOT round by default.
    return img_np.astype(out_type)
367
+
368
+
369
+ '''
370
+ # --------------------------------------------
371
+ # Augmentation, flip and/or rotate
372
+ # --------------------------------------------
373
+ # The following two are enough.
374
+ # (1) augment_img: numpy image of WxHxC or WxH
375
+ # (2) augment_img_tensor4: tensor image 1xCxWxH
376
+ # --------------------------------------------
377
+ '''
378
+
379
+
380
def augment_img(img, mode=0):
    '''Kai Zhang (github: https://github.com/cszn)

    Apply one of eight flip/rotation combinations to a numpy image, selected
    by ``mode`` (0..7); mode 0 returns ``img`` itself. Any other mode returns
    None.
    '''
    transforms = (
        lambda x: x,
        lambda x: np.flipud(np.rot90(x)),
        lambda x: np.flipud(x),
        lambda x: np.rot90(x, k=3),
        lambda x: np.flipud(np.rot90(x, k=2)),
        lambda x: np.rot90(x),
        lambda x: np.rot90(x, k=2),
        lambda x: np.flipud(np.rot90(x, k=3)),
    )
    for candidate, transform in enumerate(transforms):
        if mode == candidate:
            return transform(img)
    return None
399
+
400
+
401
def augment_img_tensor4(img, mode=0):
    '''Kai Zhang (github: https://github.com/cszn)

    Apply one of eight flip/rotation combinations over axes 2 and 3 of a
    tensor, selected by ``mode`` (0..7); mode 0 returns ``img`` itself. Any
    other mode returns None.
    '''
    transforms = (
        lambda t: t,
        lambda t: t.rot90(1, [2, 3]).flip([2]),
        lambda t: t.flip([2]),
        lambda t: t.rot90(3, [2, 3]),
        lambda t: t.rot90(2, [2, 3]).flip([2]),
        lambda t: t.rot90(1, [2, 3]),
        lambda t: t.rot90(2, [2, 3]),
        lambda t: t.rot90(3, [2, 3]).flip([2]),
    )
    for candidate, transform in enumerate(transforms):
        if mode == candidate:
            return transform(img)
    return None
420
+
421
+
422
def augment_img_tensor(img, mode=0):
    '''Kai Zhang (github: https://github.com/cszn)

    Run ``augment_img`` on a 3D or 4D tensor by round-tripping through numpy.
    The last two tensor axes are moved to the front for the augmentation and
    moved back afterwards; the result has the type of ``img``.
    '''
    rank = len(img.size())
    arr = img.data.cpu().numpy()
    if rank == 3:
        arr = np.transpose(arr, (1, 2, 0))
    elif rank == 4:
        arr = np.transpose(arr, (2, 3, 1, 0))

    arr = augment_img(arr, mode=mode)
    # the augmented array may be a strided view, so it is made contiguous first
    result = torch.from_numpy(np.ascontiguousarray(arr))

    if rank == 3:
        result = result.permute(2, 0, 1)
    elif rank == 4:
        result = result.permute(3, 2, 0, 1)
    return result.type_as(img)
439
+
440
+
441
def augment_img_np3(img, mode=0):
    """Apply one of eight flip/transpose combinations to a 3D numpy image.

    ``mode`` (0..7) selects the combination; mode 0 returns ``img`` itself and
    any other value returns None.
    """
    def flip_rows(x):
        return x[::-1, :, :]

    def flip_cols(x):
        return x[:, ::-1, :]

    def swap_axes(x):
        return x.transpose(1, 0, 2)

    # Each entry lists the steps for one mode, in the order they are applied.
    pipelines = (
        (),
        (swap_axes,),
        (flip_rows,),
        (flip_rows, swap_axes),
        (flip_cols,),
        (flip_cols, swap_axes),
        (flip_cols, flip_rows),
        (flip_cols, flip_rows, swap_axes),
    )
    for candidate, steps in enumerate(pipelines):
        if mode == candidate:
            for step in steps:
                img = step(img)
            return img
    return None
467
+
468
+
469
def augment_imgs(img_list, hflip=True, rot=True):
    """Apply the same random flips/transpose to every image in ``img_list``.

    Args:
        img_list: list of 3D numpy images.
        hflip: allow a horizontal flip (applied with probability 0.5).
        rot: allow a vertical flip and an axis swap (each applied with
            probability 0.5).

    Returns:
        A new list with the augmented images.
    """
    # One decision per transform, shared by all images in the list.
    do_hflip = hflip and random.random() < 0.5
    do_vflip = rot and random.random() < 0.5
    do_swap = rot and random.random() < 0.5

    augmented = []
    for img in img_list:
        if do_hflip:
            img = img[:, ::-1, :]
        if do_vflip:
            img = img[::-1, :, :]
        if do_swap:
            img = img.transpose(1, 0, 2)
        augmented.append(img)
    return augmented
485
+
486
+
487
+ '''
488
+ # --------------------------------------------
489
+ # modcrop and shave
490
+ # --------------------------------------------
491
+ '''
492
+
493
+
494
def modcrop(img_in, scale):
    """Return a copy of ``img_in`` cropped so height and width are multiples of ``scale``.

    Args:
        img_in: numpy image, HWC or HW.
        scale: the divisor both spatial sizes must match.

    Raises:
        ValueError: if the image is neither 2D nor 3D.
    """
    img = np.copy(img_in)
    if img.ndim not in (2, 3):
        raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
    H, W = img.shape[:2]
    return img[:H - H % scale, :W - W % scale]
508
+
509
+
510
def shave(img_in, border=0):
    """Return a copy of ``img_in`` (HWC or HW) with ``border`` pixels removed from each side."""
    img = np.copy(img_in)
    rows, cols = img.shape[:2]
    return img[border:rows - border, border:cols - border]
516
+
517
+
518
+ '''
519
+ # --------------------------------------------
520
+ # image processing process on numpy image
521
+ # channel_convert(in_c, tar_type, img_list):
522
+ # rgb2ycbcr(img, only_y=True):
523
+ # bgr2ycbcr(img, only_y=True):
524
+ # ycbcr2rgb(img):
525
+ # --------------------------------------------
526
+ '''
527
+
528
+
529
def rgb2ycbcr(img, only_y=True):
    '''same as matlab rgb2ycbcr
    only_y: only return Y channel
    Input:
        uint8, [0, 255]
        float, [0, 1]
    Output:
        array of the input dtype; uint8 results are rounded, float results
        are scaled back to [0, 1]

    The input array is never modified: non-uint8 input is rescaled into a new
    array instead of being multiplied in place.
    '''
    in_img_type = img.dtype
    if in_img_type != np.uint8:
        img = img * 255.
    # convert
    if only_y:
        rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
    else:
        rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
                              [24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
        rlt /= 255.
    return rlt.astype(in_img_type)
551
+
552
+
553
def ycbcr2rgb(img):
    '''same as matlab ycbcr2rgb
    Input:
        uint8, [0, 255]
        float, [0, 1]
    Output:
        array of the input dtype; uint8 results are rounded, float results
        are scaled back to [0, 1]

    The input array is never modified: non-uint8 input is rescaled into a new
    array instead of being multiplied in place.
    '''
    in_img_type = img.dtype
    if in_img_type != np.uint8:
        img = img * 255.
    # convert
    rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071],
                          [0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
        rlt /= 255.
    return rlt.astype(in_img_type)
571
+
572
+
573
def bgr2ycbcr(img, only_y=True):
    '''bgr version of rgb2ycbcr
    only_y: only return Y channel
    Input:
        uint8, [0, 255]
        float, [0, 1]
    Output:
        array of the input dtype; uint8 results are rounded, float results
        are scaled back to [0, 1]

    The input array is never modified: non-uint8 input is rescaled into a new
    array instead of being multiplied in place.
    '''
    in_img_type = img.dtype
    if in_img_type != np.uint8:
        img = img * 255.
    # convert
    if only_y:
        rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
    else:
        rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
                              [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
        rlt /= 255.
    return rlt.astype(in_img_type)
595
+
596
+
597
def channel_convert(in_c, tar_type, img_list):
    """Convert a list of images among BGR, gray and y.

    Args:
        in_c: number of channels of the input images.
        tar_type: 'gray' or 'y' (for 3-channel input) or 'RGB' (for
            1-channel input, converted with ``cv2.COLOR_GRAY2BGR``).
        img_list: list of numpy images.

    Returns:
        A new list of converted images; for any other combination
        ``img_list`` is returned unchanged.
    """
    if in_c == 3 and tar_type == 'gray':  # BGR to gray
        return [np.expand_dims(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), axis=2) for img in img_list]
    if in_c == 3 and tar_type == 'y':  # BGR to y
        return [np.expand_dims(bgr2ycbcr(img, only_y=True), axis=2) for img in img_list]
    if in_c == 1 and tar_type == 'RGB':  # gray/y to BGR
        return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list]
    return img_list
609
+
610
+
611
+ '''
612
+ # --------------------------------------------
613
+ # metric, PSNR and SSIM
614
+ # --------------------------------------------
615
+ '''
616
+
617
+
618
+ # --------------------------------------------
619
+ # PSNR
620
+ # --------------------------------------------
621
def calculate_psnr(img1, img2, border=0):
    """Compute the PSNR between two images with range [0, 255].

    Args:
        img1, img2: arrays of identical shape.
        border: number of pixels ignored on every side of the first two axes.

    Returns:
        The PSNR in dB, or ``inf`` when the compared regions are identical.

    Raises:
        ValueError: if the shapes differ.
    """
    if img1.shape != img2.shape:
        raise ValueError('Input images must have the same dimensions.')
    rows, cols = img1.shape[:2]
    region = (slice(border, rows - border), slice(border, cols - border))

    diff = img1[region].astype(np.float64) - img2[region].astype(np.float64)
    mse = np.mean(diff ** 2)
    if mse == 0:
        return float('inf')
    return 20 * math.log10(255.0 / math.sqrt(mse))
637
+
638
+
639
+ # --------------------------------------------
640
+ # SSIM
641
+ # --------------------------------------------
642
def calculate_ssim(img1, img2, border=0):
    '''calculate SSIM
    the same outputs as MATLAB's
    img1, img2: [0, 255]
    border: number of pixels ignored on every side of the first two axes

    A 2D image, or a 3D image with one channel, is scored directly; a 3D
    image with three channels gets the mean of the per-channel scores.
    Raises ValueError when the shapes differ or the layout is anything else,
    so an unsupported input can never yield a silent None.
    '''
    if not img1.shape == img2.shape:
        raise ValueError('Input images must have the same dimensions.')
    h, w = img1.shape[:2]
    img1 = img1[border:h-border, border:w-border]
    img2 = img2[border:h-border, border:w-border]

    if img1.ndim == 2:
        return ssim(img1, img2)
    if img1.ndim == 3:
        if img1.shape[2] == 3:
            return np.array([ssim(img1[:, :, i], img2[:, :, i]) for i in range(3)]).mean()
        if img1.shape[2] == 1:
            return ssim(np.squeeze(img1), np.squeeze(img2))
    raise ValueError('Wrong input image dimensions.')
667
+
668
+
669
def ssim(img1, img2):
    """Compute the mean SSIM of two single-channel images with range [0, 255].

    Local statistics use an 11x11 Gaussian window (sigma 1.5); five pixels
    are trimmed from every side of each filtered map so only positions with
    a full window contribute.
    """
    C1 = (0.01 * 255)**2
    C2 = (0.03 * 255)**2

    x = img1.astype(np.float64)
    y = img2.astype(np.float64)
    gauss_1d = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(gauss_1d, gauss_1d.transpose())

    def local_mean(values):
        # filtered map without the 5-pixel margin ("valid" region)
        return cv2.filter2D(values, -1, window)[5:-5, 5:-5]

    mu_x = local_mean(x)
    mu_y = local_mean(y)
    mu_x_sq = mu_x**2
    mu_y_sq = mu_y**2
    mu_xy = mu_x * mu_y
    var_x = local_mean(x**2) - mu_x_sq
    var_y = local_mean(y**2) - mu_y_sq
    cov_xy = local_mean(x * y) - mu_xy

    numerator = (2 * mu_xy + C1) * (2 * cov_xy + C2)
    denominator = (mu_x_sq + mu_y_sq + C1) * (var_x + var_y + C2)
    return (numerator / denominator).mean()
690
+
691
+
692
+ '''
693
+ # --------------------------------------------
694
+ # matlab's bicubic imresize (numpy and torch) [0, 1]
695
+ # --------------------------------------------
696
+ '''
697
+
698
+
699
+ # matlab 'imresize' function, now only support 'bicubic'
700
def cubic(x):
    """Evaluate the piecewise-cubic interpolation kernel on a tensor.

    The first polynomial applies where ``|x| <= 1``, the second where
    ``1 < |x| <= 2``; the result is zero everywhere else.
    """
    absx = torch.abs(x)
    absx2 = absx**2
    absx3 = absx**3
    inner_mask = (absx <= 1).type_as(absx)
    outer_mask = ((absx > 1) & (absx <= 2)).type_as(absx)
    inner_poly = 1.5*absx3 - 2.5*absx2 + 1
    outer_poly = -0.5*absx3 + 2.5*absx2 - 4*absx + 2
    return inner_poly * inner_mask + outer_poly * outer_mask
706
+
707
+
708
def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
    """Compute 1-D resampling weights and source indices for one image axis.

    Args:
        in_length: number of input samples along the axis.
        out_length: number of output samples along the axis.
        scale: resize factor.
        kernel: accepted for interface compatibility; not used here (the
            cubic kernel is always applied).
        kernel_width: support of the kernel; widened by 1/scale when
            downscaling with antialiasing.
        antialiasing: whether to widen/rescale the kernel when scale < 1.

    Returns:
        (weights, indices, sym_len_s, sym_len_e): `weights` has one row per
        output sample, each row normalised to sum to 1; `indices` holds the
        matching source positions shifted so that the smallest index is 0;
        `sym_len_s` and `sym_len_e` are the integer padding lengths implied
        at the start and end of the axis.
    """
    widen_kernel = (scale < 1) and antialiasing
    if widen_kernel:
        # A wider kernel interpolates and antialiases in one pass.
        kernel_width = kernel_width / scale

    # Output-space coordinates (1-based).
    out_coords = torch.linspace(1, out_length, out_length)

    # Inverse mapping: 0.5 in output space maps to 0.5 in input space and
    # 0.5 + scale maps to 1.5.
    centers = out_coords / scale + 0.5 * (1 - 1 / scale)

    # Left-most input sample that can contribute to each output sample.
    leftmost = torch.floor(centers - kernel_width / 2)

    # Upper bound on contributing samples; spare columns are trimmed below.
    taps = math.ceil(kernel_width) + 2

    # Row k lists the input positions that feed output sample k.
    offsets = torch.linspace(0, taps - 1, taps).view(1, taps)
    indices = leftmost.view(out_length, 1) + offsets

    # Row k holds the kernel response for each of those positions.
    distance_to_center = centers.view(out_length, 1) - indices
    if widen_kernel:
        weights = scale * cubic(distance_to_center * scale)
    else:
        weights = cubic(distance_to_center)

    # Normalise so every row sums to 1.
    weights = weights / torch.sum(weights, 1).view(out_length, 1)

    # Trim the first / last column when it contains exactly-zero weights.
    # The zero counts are taken once, before any trimming.
    zero_counts = torch.sum((weights == 0), 0)
    trimmed = taps - 2
    if not math.isclose(zero_counts[0], 0, rel_tol=1e-6):
        indices = indices.narrow(1, 1, trimmed)
        weights = weights.narrow(1, 1, trimmed)
    if not math.isclose(zero_counts[-1], 0, rel_tol=1e-6):
        indices = indices.narrow(1, 0, trimmed)
        weights = weights.narrow(1, 0, trimmed)

    weights = weights.contiguous()
    indices = indices.contiguous()

    # Padding needed before the first and after the last input sample, then
    # shift the indices so they address the padded axis from 0.
    sym_len_s = -indices.min() + 1
    sym_len_e = indices.max() - in_length
    indices = indices + sym_len_s - 1
    return weights, indices, int(sym_len_s), int(sym_len_e)
761
+
762
+
763
+ # --------------------------------------------
764
+ # imresize for tensor image [0, 1]
765
+ # --------------------------------------------
766
def imresize(img, scale, antialiasing=True):
    """Resize a tensor image with MATLAB-style bicubic interpolation.

    Args:
        img: torch tensor laid out as CHW or HW (the original header comment
            states values in [0, 1]). The input is not modified.
        scale: resize factor, applied identically to height and width.
        antialiasing: forwarded to `calculate_weights_indices`.

    Returns:
        A float tensor in the same layout as the input (CHW or HW) with
        spatial size ceil(H * scale) x ceil(W * scale); values are not rounded.
    """
    need_squeeze = img.dim() == 2
    if need_squeeze:
        # Out-of-place: the in-place unsqueeze_ would silently turn the
        # caller's HW tensor into a 1xHxW tensor.
        img = img.unsqueeze(0)
    in_C, in_H, in_W = img.size()
    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
    kernel_width = 4
    kernel = 'cubic'

    # The resize is always done along H first, then along W.

    # get weights and indices
    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
        in_H, out_H, scale, kernel, kernel_width, antialiasing)
    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
        in_W, out_W, scale, kernel, kernel_width, antialiasing)

    # process H dimension
    # symmetric copying: pad top and bottom with mirrored rows
    img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W)
    img_aug.narrow(1, sym_len_Hs, in_H).copy_(img)

    sym_patch = img[:, :sym_len_Hs, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    img_aug.narrow(1, 0, sym_len_Hs).copy_(sym_patch_inv)

    sym_patch = img[:, -sym_len_He:, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    img_aug.narrow(1, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)

    out_1 = torch.FloatTensor(in_C, out_H, in_W)
    kernel_width = weights_H.size(1)
    for i in range(out_H):
        # First padded row contributing to output row i.
        idx = int(indices_H[i][0])
        for j in range(out_C):
            out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i])

    # process W dimension
    # symmetric copying: pad left and right with mirrored columns
    out_1_aug = torch.FloatTensor(in_C, out_H, in_W + sym_len_Ws + sym_len_We)
    out_1_aug.narrow(2, sym_len_Ws, in_W).copy_(out_1)

    sym_patch = out_1[:, :, :sym_len_Ws]
    inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(2, inv_idx)
    out_1_aug.narrow(2, 0, sym_len_Ws).copy_(sym_patch_inv)

    sym_patch = out_1[:, :, -sym_len_We:]
    inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(2, inv_idx)
    out_1_aug.narrow(2, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)

    out_2 = torch.FloatTensor(in_C, out_H, out_W)
    kernel_width = weights_W.size(1)
    for i in range(out_W):
        # First padded column contributing to output column i.
        idx = int(indices_W[i][0])
        for j in range(out_C):
            out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i])
    if need_squeeze:
        # Drop only the channel axis added above; a bare squeeze would also
        # collapse an output height or width of 1.
        out_2 = out_2.squeeze(0)
    return out_2
834
+
835
+
836
+ # --------------------------------------------
837
+ # imresize for numpy image [0, 1]
838
+ # --------------------------------------------
839
def imresize_np(img, scale, antialiasing=True):
    """Resize a NumPy image with MATLAB-style bicubic interpolation.

    Args:
        img: NumPy array laid out as HWC or HW (the original header comment
            states values in [0, 1]).
        scale: resize factor, applied identically to height and width.
        antialiasing: forwarded to `calculate_weights_indices`.

    Returns:
        A NumPy array in the same layout as the input (HWC or HW) with
        spatial size ceil(H * scale) x ceil(W * scale); values are not
        rounded. The work buffers are `torch.FloatTensor`s, so the result is
        float32.
    """
    img = torch.from_numpy(img)
    need_squeeze = img.dim() == 2
    if need_squeeze:
        # Add a trailing channel axis so the HWC code path applies.
        img = img.unsqueeze(2)

    in_H, in_W, in_C = img.size()
    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
    kernel_width = 4
    kernel = 'cubic'

    # The resize is always done along H first, then along W.

    # get weights and indices
    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
        in_H, out_H, scale, kernel, kernel_width, antialiasing)
    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
        in_W, out_W, scale, kernel, kernel_width, antialiasing)

    # process H dimension
    # symmetric copying: pad top and bottom with mirrored rows
    img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C)
    img_aug.narrow(0, sym_len_Hs, in_H).copy_(img)

    sym_patch = img[:sym_len_Hs, :, :]
    inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(0, inv_idx)
    img_aug.narrow(0, 0, sym_len_Hs).copy_(sym_patch_inv)

    sym_patch = img[-sym_len_He:, :, :]
    inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(0, inv_idx)
    img_aug.narrow(0, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)

    out_1 = torch.FloatTensor(out_H, in_W, in_C)
    kernel_width = weights_H.size(1)
    for i in range(out_H):
        # First padded row contributing to output row i.
        idx = int(indices_H[i][0])
        for j in range(out_C):
            out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i])

    # process W dimension
    # symmetric copying: pad left and right with mirrored columns
    out_1_aug = torch.FloatTensor(out_H, in_W + sym_len_Ws + sym_len_We, in_C)
    out_1_aug.narrow(1, sym_len_Ws, in_W).copy_(out_1)

    sym_patch = out_1[:, :sym_len_Ws, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    out_1_aug.narrow(1, 0, sym_len_Ws).copy_(sym_patch_inv)

    sym_patch = out_1[:, -sym_len_We:, :]
    inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
    sym_patch_inv = sym_patch.index_select(1, inv_idx)
    out_1_aug.narrow(1, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)

    out_2 = torch.FloatTensor(out_H, out_W, in_C)
    kernel_width = weights_W.size(1)
    for i in range(out_W):
        # First padded column contributing to output column i.
        idx = int(indices_W[i][0])
        for j in range(out_C):
            out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i])
    if need_squeeze:
        # Drop only the channel axis added above; a bare squeeze would also
        # collapse an output height or width of 1.
        out_2 = out_2.squeeze(2)

    return out_2.numpy()
910
+
911
+
912
if __name__ == '__main__':
    # Script entry point: currently only prints a separator line.
    print('---')
    # Disabled usage example (reads an image, converts it and downsizes by 4):
    # img = imread_uint('test.bmp', 3)
    # img = uint2single(img)
    # img_bicubic = imresize_np(img, 1/4)
ldm/modules/losses/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator
ldm/modules/losses/contperceptual.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no?
5
+
6
+
7
class LPIPSWithDiscriminator(nn.Module):
    """Autoencoder loss: L1 + LPIPS reconstruction, KL term and a patch GAN.

    `forward` is called once per optimizer: `optimizer_idx == 0` yields the
    generator (autoencoder) loss, `optimizer_idx == 1` the discriminator loss.
    The reconstruction term is scaled by a learned log-variance (`logvar`).
    """

    def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
                 disc_loss="hinge"):
        super().__init__()
        assert disc_loss in ["hinge", "vanilla"]

        self.kl_weight = kl_weight
        self.pixel_weight = pixelloss_weight  # stored but not read in this class
        self.perceptual_loss = LPIPS().eval()
        self.perceptual_weight = perceptual_weight
        # Learnable scalar log-variance of the reconstruction likelihood.
        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)

        discriminator = NLayerDiscriminator(
            input_nc=disc_in_channels,
            n_layers=disc_num_layers,
            use_actnorm=use_actnorm,
        )
        self.discriminator = discriminator.apply(weights_init)

        # Global step below which the adversarial factor is forced to zero.
        self.discriminator_iter_start = disc_start
        if disc_loss == "hinge":
            self.disc_loss = hinge_d_loss
        else:
            self.disc_loss = vanilla_d_loss
        self.disc_factor = disc_factor
        self.discriminator_weight = disc_weight
        self.disc_conditional = disc_conditional

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
        """Balance the GAN loss against the NLL loss via gradient norms.

        The weight is ||d nll / d layer|| / (||d g / d layer|| + 1e-4),
        clamped to [0, 1e4], detached, then scaled by `discriminator_weight`.
        When `last_layer` is None, `self.last_layer[0]` is used instead; that
        attribute is not set anywhere in this class.
        """
        layer = last_layer if last_layer is not None else self.last_layer[0]
        nll_grads = torch.autograd.grad(nll_loss, layer, retain_graph=True)[0]
        g_grads = torch.autograd.grad(g_loss, layer, retain_graph=True)[0]

        ratio = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        ratio = torch.clamp(ratio, 0.0, 1e4).detach()
        return ratio * self.discriminator_weight

    def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
                global_step, last_layer=None, cond=None, split="train",
                weights=None):
        """Return `(loss, log_dict)` for the requested optimizer.

        `posteriors` must expose `.kl()`. `weights`, when given, rescales the
        per-element NLL used in the total loss only; the logged NLL stays
        unweighted. Log keys are prefixed with `split`. Any `optimizer_idx`
        other than 0 or 1 falls through and returns None.
        """
        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
        if self.perceptual_weight > 0:
            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
            rec_loss = rec_loss + self.perceptual_weight * p_loss

        # NLL with learned variance; sums are normalised by the batch size.
        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
        weighted_nll_loss = nll_loss if weights is None else weights * nll_loss
        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
        kl_loss = posteriors.kl()
        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]

        # now the GAN part
        if optimizer_idx == 0:
            # generator update
            fake = reconstructions.contiguous()
            if cond is None:
                assert not self.disc_conditional
            else:
                assert self.disc_conditional
                fake = torch.cat((fake, cond), dim=1)
            logits_fake = self.discriminator(fake)
            g_loss = -torch.mean(logits_fake)

            # Zero unless the adaptive weight can actually be computed.
            d_weight = torch.tensor(0.0)
            if self.disc_factor > 0.0:
                try:
                    d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
                except RuntimeError:
                    # Tolerated only outside training (e.g. no autograd graph).
                    assert not self.training

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss

            log = {
                f"{split}/total_loss": loss.clone().detach().mean(),
                f"{split}/logvar": self.logvar.detach(),
                f"{split}/kl_loss": kl_loss.detach().mean(),
                f"{split}/nll_loss": nll_loss.detach().mean(),
                f"{split}/rec_loss": rec_loss.detach().mean(),
                f"{split}/d_weight": d_weight.detach(),
                f"{split}/disc_factor": torch.tensor(disc_factor),
                f"{split}/g_loss": g_loss.detach().mean(),
            }
            return loss, log

        if optimizer_idx == 1:
            # second pass for discriminator update
            real = inputs.contiguous().detach()
            fake = reconstructions.contiguous().detach()
            if cond is not None:
                real = torch.cat((real, cond), dim=1)
                fake = torch.cat((fake, cond), dim=1)
            logits_real = self.discriminator(real)
            logits_fake = self.discriminator(fake)

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

            log = {
                f"{split}/disc_loss": d_loss.clone().detach().mean(),
                f"{split}/logits_real": logits_real.detach().mean(),
                f"{split}/logits_fake": logits_fake.detach().mean(),
            }
            return d_loss, log
111
+
ldm/modules/losses/vqperceptual.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ from einops import repeat
5
+
6
+ from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
7
+ from taming.modules.losses.lpips import LPIPS
8
+ from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
9
+
10
+
11
def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
    """Hinge discriminator loss with a per-example weighted average.

    The hinge terms are averaged over dims 1-3 of each logit tensor (so the
    logits must be 4-D), then combined across the batch as a weighted mean
    using `weights` (one weight per example).
    """
    assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]

    per_example_real = F.relu(1. - logits_real).mean(dim=[1, 2, 3])
    per_example_fake = F.relu(1. + logits_fake).mean(dim=[1, 2, 3])

    real_term = (weights * per_example_real).sum() / weights.sum()
    fake_term = (weights * per_example_fake).sum() / weights.sum()
    return 0.5 * (real_term + fake_term)
19
+
20
def adopt_weight(weight, global_step, threshold=0, value=0.):
    """Return `value` while `global_step` is below `threshold`, else `weight`."""
    return value if global_step < threshold else weight
24
+
25
+
26
def measure_perplexity(predicted_indices, n_embed):
    """Return codebook perplexity and the number of codes in use.

    Perplexity is exp(entropy) of the empirical code distribution; it equals
    `n_embed` when every code is used equally often. `cluster_use` counts the
    codes that occur at least once in `predicted_indices`.
    """
    # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
    one_hot = F.one_hot(predicted_indices, n_embed).float()
    usage = one_hot.reshape(-1, n_embed).mean(0)
    # The small epsilon keeps log() finite for unused codes.
    entropy = -(usage * torch.log(usage + 1e-10)).sum()
    perplexity = entropy.exp()
    cluster_use = torch.sum(usage > 0)
    return perplexity, cluster_use
34
+
35
def l1(x, y):
    """Element-wise absolute difference |x - y| (no reduction)."""
    difference = x - y
    return torch.abs(difference)
37
+
38
+
39
def l2(x, y):
    """Element-wise squared difference (x - y)^2 (no reduction)."""
    difference = x - y
    return torch.pow(difference, 2)
41
+
42
+
43
class VQLPIPSWithDiscriminator(nn.Module):
    """VQ autoencoder loss: pixel + LPIPS reconstruction, codebook term, patch GAN.

    `forward` is called once per optimizer: `optimizer_idx == 0` yields the
    generator (autoencoder) loss, `optimizer_idx == 1` the discriminator loss.
    """

    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
                 disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
                 pixel_loss="l1"):
        super().__init__()
        assert disc_loss in ["hinge", "vanilla"]
        assert perceptual_loss in ["lpips", "clips", "dists"]
        assert pixel_loss in ["l1", "l2"]
        self.codebook_weight = codebook_weight
        self.pixel_weight = pixelloss_weight  # stored but not read in this class
        # Only "lpips" is implemented; the other accepted names raise below.
        if perceptual_loss == "lpips":
            print(f"{self.__class__.__name__}: Running with LPIPS.")
            self.perceptual_loss = LPIPS().eval()
        else:
            raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
        self.perceptual_weight = perceptual_weight

        if pixel_loss == "l1":
            self.pixel_loss = l1
        else:
            self.pixel_loss = l2

        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
                                                 n_layers=disc_num_layers,
                                                 use_actnorm=use_actnorm,
                                                 ndf=disc_ndf
                                                 ).apply(weights_init)
        # Global step below which the adversarial factor is forced to zero.
        self.discriminator_iter_start = disc_start
        if disc_loss == "hinge":
            self.disc_loss = hinge_d_loss
        elif disc_loss == "vanilla":
            self.disc_loss = vanilla_d_loss
        else:
            raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
        print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
        self.disc_factor = disc_factor
        self.discriminator_weight = disc_weight
        self.disc_conditional = disc_conditional
        # Codebook size used for the perplexity statistics in `forward`.
        self.n_classes = n_classes

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
        """Balance the GAN loss against the NLL loss via gradient norms.

        The weight is ||d nll / d layer|| / (||d g / d layer|| + 1e-4),
        clamped to [0, 1e4], detached, then scaled by `discriminator_weight`.
        When `last_layer` is None, `self.last_layer[0]` is used instead; that
        attribute is not set anywhere in this class.
        """
        if last_layer is not None:
            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
        else:
            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]

        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
        d_weight = d_weight * self.discriminator_weight
        return d_weight

    def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
                global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
        """Return `(loss, log_dict)` for the requested optimizer.

        Args:
            codebook_loss: quantisation loss tensor, or None to treat it as 0.
            inputs, reconstructions: tensors compared by the pixel and
                perceptual losses and fed to the discriminator.
            optimizer_idx: 0 for the generator update, 1 for the
                discriminator update; any other value returns None.
            global_step: current step, compared against the discriminator
                start step.
            last_layer: parameter used for the adaptive GAN weight.
            cond: optional conditioning, concatenated on dim 1 before the
                discriminator.
            split: prefix for the log keys.
            predicted_indices: optional code indices; when given, perplexity
                and cluster usage are logged (requires `n_classes`).
        """
        # The original called an `exists` helper that is neither imported nor
        # defined in this module, so every call raised NameError.
        if codebook_loss is None:
            codebook_loss = torch.tensor([0.]).to(inputs.device)
        rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
        if self.perceptual_weight > 0:
            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
            rec_loss = rec_loss + self.perceptual_weight * p_loss
        else:
            p_loss = torch.tensor([0.0])

        nll_loss = rec_loss
        nll_loss = torch.mean(nll_loss)

        # now the GAN part
        if optimizer_idx == 0:
            # generator update
            if cond is None:
                assert not self.disc_conditional
                logits_fake = self.discriminator(reconstructions.contiguous())
            else:
                assert self.disc_conditional
                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
            g_loss = -torch.mean(logits_fake)

            try:
                d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
            except RuntimeError:
                # Tolerated only outside training (e.g. no autograd graph).
                assert not self.training
                d_weight = torch.tensor(0.0)

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()

            log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
                   "{}/quant_loss".format(split): codebook_loss.detach().mean(),
                   "{}/nll_loss".format(split): nll_loss.detach().mean(),
                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
                   "{}/p_loss".format(split): p_loss.detach().mean(),
                   "{}/d_weight".format(split): d_weight.detach(),
                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
                   "{}/g_loss".format(split): g_loss.detach().mean(),
                   }
            if predicted_indices is not None:
                assert self.n_classes is not None
                with torch.no_grad():
                    perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
                log[f"{split}/perplexity"] = perplexity
                log[f"{split}/cluster_usage"] = cluster_usage
            return loss, log

        if optimizer_idx == 1:
            # second pass for discriminator update
            if cond is None:
                logits_real = self.discriminator(inputs.contiguous().detach())
                logits_fake = self.discriminator(reconstructions.contiguous().detach())
            else:
                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
                   "{}/logits_real".format(split): logits_real.detach().mean(),
                   "{}/logits_fake".format(split): logits_fake.detach().mean()
                   }
            return d_loss, log