Spaces:

AIGC-Audio
/

Make-An-Audio-3

Running on Zero

File size: 2,637 Bytes

a84a65c

import torch
import torch.nn as nn


class VGGishish(nn.Module):
    '''VGG-style convolutional classifier for single-channel 2D inputs.

    The network is a configurable stack of 3x3 convolutions and 2x2 max-pools
    (`self.features`), followed by adaptive average pooling to a fixed 5x10
    grid and a three-layer fully connected classifier.
    '''

    def __init__(self, conv_layers, use_bn, num_classes):
        '''
        Mostly from
            https://pytorch.org/vision/0.8/_modules/torchvision/models/vgg.html

        Args:
            conv_layers: iterable describing the backbone. Each `'MP'` entry
                adds a 2x2 max-pool with stride 2; any other entry is taken as
                the number of output channels of a 3x3 convolution
                (padding 1, stride 1) followed by ReLU.
            use_bn: if truthy, a `BatchNorm2d` is inserted between every
                convolution and its ReLU.
            num_classes: size of the final linear layer's output.
        '''
        super().__init__()
        layers = []
        in_channels = 1

        # a list of channels with 'MP' (maxpool) from config
        for v in conv_layers:
            if v == 'MP':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1, stride=1)
                if use_bn:
                    layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                else:
                    layers += [conv2d, nn.ReLU(inplace=True)]
                in_channels = v
        self.features = nn.Sequential(*layers)

        # fixed spatial size fed to the classifier, whatever the input size is
        pooled_hw = (5, 10)
        self.avgpool = nn.AdaptiveAvgPool2d(pooled_hw)

        self.flatten = nn.Flatten()
        # `in_channels` now holds the width of the last convolution (or 1 if the
        # config has no convolution), so the classifier input always matches the
        # flattened backbone output instead of assuming 512 channels.
        self.classifier = nn.Sequential(
            nn.Linear(in_channels * pooled_hw[0] * pooled_hw[1], 4096),
            nn.ReLU(True),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Linear(4096, num_classes)
        )

        # weight init
        self.reset_parameters()

    def forward(self, x):
        '''Compute class scores for a batch of 2D inputs.

        Args:
            x: tensor without a channel dimension; a channel axis is inserted
                at position 1 before the convolutions, i.e. (B, F, T) becomes
                (B, 1, F, T).

        Returns:
            The output of the classifier, with `num_classes` values per item.
        '''
        # adding channel dim for conv2d (B, 1, F, T) <- (B, F, T)
        x = x.unsqueeze(1)
        # backbone (B, C, F', T') <- (B, 1, F, T); C is the last conv width and
        # each 'MP' halves F and T, e.g. (B, 512, 5, 53) <- (B, 1, 80, 848)
        # with four max-pools
        x = self.features(x)
        # adaptive avg pooling (B, C, 5, 10) <- (B, C, F', T')
        x = self.avgpool(x)
        # flatten (B, C * 5 * 10) <- (B, C, 5, 10)
        x = self.flatten(x)
        # classify (B, num_classes) <- (B, C * 5 * 10)
        x = self.classifier(x)
        return x

    def reset_parameters(self):
        '''Re-initialise all conv, batch-norm and linear layers in place.

        Convolutions get Kaiming-normal weights (fan-out, ReLU gain) and zero
        bias; batch-norm layers get unit weight and zero bias; linear layers
        get N(0, 0.01) weights and zero bias.
        '''
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)


if __name__ == '__main__':
    # Smoke test: run one random batch through a VGG-16-like backbone without
    # batch norm and print the shape of the resulting class scores.
    n_classes = 309
    batch = torch.rand(3, 80, 848)
    vgg_cfg = [
        64, 64, 'MP',
        128, 128, 'MP',
        256, 256, 256, 'MP',
        512, 512, 512, 'MP',
        512, 512, 512,
    ]
    # lighter alternative config:
    # vgg_cfg = [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512, 'MP']
    net = VGGishish(vgg_cfg, use_bn=False, num_classes=n_classes)
    scores = net(batch)
    print(scores.shape)