diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..144cb69b1c6da603b1fe06698fac665485699636 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.sf2 filter=lfs diff=lfs merge=lfs -text
diff --git a/app.py b/app.py
index a699bc5b3c2e987102ca93e0ee28d601e0a93d02..418f623e612ce5f8c695c82b7cfd4131318e34d2 100644
--- a/app.py
+++ b/app.py
@@ -1,7 +1,741 @@
import gradio as gr
+from pathlib import Path
-def greet(name):
- return "Hello " + name + "!!"
+import torch
+import shutil
+import os
+import subprocess
+import cv2
+import math
+import clip
+import numpy as np
+from PIL import Image
+from scenedetect import open_video, SceneManager
+from scenedetect.detectors import ContentDetector, AdaptiveDetector
+from scenedetect.video_splitter import split_video_ffmpeg
+from scenedetect.scene_manager import save_images
+from utilities.constants import *
+from utilities.chord_to_midi import *
+
+from model.video_music_transformer import VideoMusicTransformer
+from model.video_regression import VideoRegression
+
+import json
+from midi2audio import FluidSynth
+import moviepy.editor as mp
+from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
+import random
+from moviepy.editor import *
+import time
+
+from tqdm import tqdm
+from huggingface_hub import snapshot_download
+
+all_key_names = ['C major', 'G major', 'D major', 'A major',
+ 'E major', 'B major', 'F major', 'Bb major',
+ 'Eb major', 'Ab major', 'Db major', 'Gb major',
+ 'A minor', 'E minor', 'B minor', 'F# minor',
+ 'C# minor', 'G# minor', 'D minor', 'G minor',
+ 'C minor', 'F minor', 'Bb minor', 'Eb minor',
+ ]
+
+# Semitone shift from C major / A minor to each key, kept within -7..+4.
+transpose_key_dic = {
+ 'F major' : -7,
+ 'Gb major' : -6,
+ 'G major' : -5,
+ 'Ab major' : -4,
+ 'A major' : -3,
+ 'Bb major' : -2,
+ 'B major' : -1,
+ 'C major' : 0,
+ 'Db major' : 1,
+ 'D major' : 2,
+ 'Eb major' : 3,
+ 'E major' : 4,
+ 'D minor' : -7,
+ 'Eb minor' : -6,
+ 'E minor' : -5,
+ 'F minor' : -4,
+ 'F# minor' : -3,
+ 'G minor' : -2,
+ 'G# minor' : -1,
+ 'A minor' : 0,
+ 'Bb minor' : 1,
+ 'B minor' : 2,
+ 'C minor' : 3,
+ 'C# minor' : 4
+}
+
+flatsharpDic = {
+ 'Db':'C#',
+ 'Eb':'D#',
+ 'Gb':'F#',
+ 'Ab':'G#',
+ 'Bb':'A#'
+}
+
+max_conseq_N = 0
+max_conseq_chord = 2
+tempo = 120
+duration = 2
+
+min_loudness = 0 # Minimum loudness level in the input range
+max_loudness = 50 # Maximum loudness level in the input range
+min_velocity = 49 # Minimum velocity value in the output range
+max_velocity = 112 # Maximum velocity value in the output range
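+# Loudness (0-50) is mapped to velocity (49-112) with a concave curve
+# (exponent 0.3; see generate()): e.g. loudness 25 -> round((25/50)**0.3 * 63 + 49) = 100.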
+
+
+def split_video_into_frames(video, frame_dir):
+    output_path = os.path.join(frame_dir, "%03d.jpg")
+    # Keep roughly one frame per second; quote paths so ffmpeg handles spaces.
+    cmd = f"ffmpeg -i \"{video}\" -vf \"select=bitor(gte(t-prev_selected_t\,1)\,isnan(prev_selected_t))\" -vsync 0 -qmin 1 -q:v 1 \"{output_path}\""
+    subprocess.call(cmd, shell=True)
+
+def gen_semantic_feature(frame_dir, semantic_dir):
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model, preprocess = clip.load("ViT-L/14@336px", device=device)
+ file_names = os.listdir(frame_dir)
+ sorted_file_names = sorted(file_names)
+
+ output_path = semantic_dir / "semantic.npy"
+    # CLIP ViT-L/14 image embeddings are 768-dimensional; allocate on the
+    # active device so this also works without a GPU.
+    features = torch.zeros(len(sorted_file_names), 768, device=device)
+
+ for idx, file_name in enumerate(sorted_file_names):
+ fpath = frame_dir / file_name
+ image = preprocess(Image.open(fpath)).unsqueeze(0).to(device)
+ with torch.no_grad():
+ image_features = model.encode_image(image)
+ features[idx] = image_features[0]
+ features = features.cpu().numpy()
+ np.save(output_path, features)
+
+def gen_emotion_feature(frame_dir, emotion_dir):
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model, preprocess = clip.load("ViT-L/14@336px", device=device)
+ text = clip.tokenize(["exciting", "fearful", "tense", "sad", "relaxing", "neutral"]).to(device)
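+    # Zero-shot emotion recognition: each frame is scored against the six
+    # text prompts above and the softmax over the logits is written out.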
+
+ file_names = os.listdir(frame_dir)
+ sorted_file_names = sorted(file_names)
+ output_path = emotion_dir / "emotion.lab"
+
+ emolist = []
+ for file_name in sorted_file_names:
+ fpath = frame_dir / file_name
+ image = preprocess(Image.open(fpath)).unsqueeze(0).to(device)
+ with torch.no_grad():
+ logits_per_image, logits_per_text = model(image, text)
+ probs = logits_per_image.softmax(dim=-1).cpu().numpy()
+
+        emo_val = " ".join(format(p, ".4f") for p in probs[0])
+        emolist.append(emo_val)
+
+ with open(output_path ,'w' ,encoding = 'utf-8') as f:
+ f.write("time exciting_prob fearful_prob tense_prob sad_prob relaxing_prob neutral_prob\n")
+ for i in range(0, len(emolist) ):
+ f.write(str(i) + " "+emolist[i]+"\n")
+
+def gen_scene_feature(video, scene_dir):
+ video_stream = open_video(str(video))
+
+ scene_manager = SceneManager()
+ scene_manager.add_detector(AdaptiveDetector())
+ scene_manager.detect_scenes(video_stream, show_progress=False)
+ scene_list = scene_manager.get_scene_list()
+
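+    # Label each second of the video with the index of the scene that
+    # contains it.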
+ sec = 0
+ scenedict = {}
+ for idx, scene in enumerate(scene_list):
+ end_int = math.ceil(scene[1].get_seconds())
+ for s in range (sec, end_int):
+ scenedict[s] = str(idx)
+ sec += 1
+
+ fpathname = scene_dir / "scene.lab"
+ with open(fpathname,'w',encoding = 'utf-8') as f:
+ for i in range(0, len(scenedict)):
+ f.write(str(i) + " "+scenedict[i]+"\n")
+
+def gen_scene_offset_feature(scene_dir, scene_offset_dir):
+ src = scene_dir / "scene.lab"
+ tgt = scene_offset_dir / "scene_offset.lab"
+
+ id_list = []
+ with open(src, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+            if len(line_arr) == 2:
+ time = int(line_arr[0])
+ scene_id = int(line_arr[1])
+ id_list.append(scene_id)
+
+ offset_list = []
+ current_id = id_list[0]
+ offset = 0
+ for i in range(len(id_list)):
+ if id_list[i] != current_id:
+ current_id = id_list[i]
+ offset = 0
+ offset_list.append(offset)
+ offset += 1
+
+ with open(tgt,'w',encoding = 'utf-8') as f:
+ for i in range(0, len(offset_list)):
+ f.write(str(i) + " " + str(offset_list[i]) + "\n")
+
+def gen_motion_feature(video, motion_dir):
+ cap = cv2.VideoCapture(str(video))
+ prev_frame = None
+ prev_time = 0
+ motion_value = 0
+ motiondict = {}
+
+    # One motion sample per second: mean absolute difference between frames.
+    motiondict[0] = "0.0000"
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        curr_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
+        if prev_frame is not None and curr_time - prev_time >= 1:
+            diff = cv2.absdiff(frame, prev_frame)
+            diff_rgb = cv2.cvtColor(diff, cv2.COLOR_BGR2RGB)
+            motion_value = format(diff_rgb.mean(), ".4f")
+            motiondict[int(curr_time)] = str(motion_value)
+            prev_time = int(curr_time)
+        prev_frame = frame.copy()
+ cap.release()
+ cv2.destroyAllWindows()
+ fpathname = motion_dir / "motion.lab"
+
+ with open(fpathname,'w',encoding = 'utf-8') as f:
+ for i in range(0, len(motiondict)):
+ f.write(str(i) + " "+motiondict[i]+"\n")
+
+
+def get_scene_offset_feature(scene_offset_dir, max_seq_chord=300, max_seq_video=300):
+ feature_scene_offset = np.empty(max_seq_video)
+ feature_scene_offset.fill(SCENE_OFFSET_PAD)
+ fpath_scene_offset = scene_offset_dir / "scene_offset.lab"
+
+ with open(fpath_scene_offset, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= max_seq_chord:
+ break
+ sceneID = line_arr[1]
+ feature_scene_offset[time] = int(sceneID)+1
+
+ feature_scene_offset = torch.from_numpy(feature_scene_offset)
+ feature_scene_offset = feature_scene_offset.to(torch.float32)
+
+ return feature_scene_offset
+
+def get_motion_feature(motion_dir, max_seq_chord=300, max_seq_video=300):
+ fpath_motion = motion_dir / "motion.lab"
+ feature_motion = np.empty(max_seq_video)
+ feature_motion.fill(MOTION_PAD)
+ with open(fpath_motion, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= max_seq_chord:
+ break
+ motion = line_arr[1]
+ feature_motion[time] = float(motion)
+
+ feature_motion = torch.from_numpy(feature_motion)
+ feature_motion = feature_motion.to(torch.float32)
+ return feature_motion
+
+def get_emotion_feature(emotion_dir, max_seq_chord=300, max_seq_video=300):
+ fpath_emotion = emotion_dir / "emotion.lab"
+ feature_emotion = np.empty((max_seq_video, 6))
+ feature_emotion.fill(EMOTION_PAD)
+
+ with open(fpath_emotion, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if line_arr[0] == "time":
+ continue
+ time = line_arr[0]
+ time = int(time)
+ if time >= max_seq_chord:
+ break
+ emo1, emo2, emo3, emo4, emo5, emo6 = \
+ line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5],line_arr[6]
+ emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5), float(emo6) ]
+ emoList = np.array(emoList)
+ feature_emotion[time] = emoList
+
+ feature_emotion = torch.from_numpy(feature_emotion)
+ feature_emotion = feature_emotion.to(torch.float32)
+ return feature_emotion
+
+def get_semantic_feature(semantic_dir, max_seq_chord=300, max_seq_video=300):
+ fpath_semantic = semantic_dir / "semantic.npy"
+
+ video_feature = np.load(fpath_semantic)
+ dim_vf = video_feature.shape[1]
+
+ video_feature_tensor = torch.from_numpy( video_feature )
+ feature_semantic = torch.full((max_seq_video, dim_vf,), SEMANTIC_PAD , dtype=torch.float32, device=torch.device("cpu"))
+
+ if(video_feature_tensor.shape[0] < max_seq_video):
+ feature_semantic[:video_feature_tensor.shape[0]] = video_feature_tensor
+ else:
+ feature_semantic = video_feature_tensor[:max_seq_video]
+
+ return feature_semantic
+
+
+def text_clip(text: str, duration: int, start_time: int = 0):
+ t = TextClip(text, font='Georgia-Regular', fontsize=24, color='white')
+ t = t.set_position(("center", 20)).set_duration(duration)
+ t = t.set_start(start_time)
+ return t
+
+def convert_format_id_to_offset(id_list):
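+    # Turn per-step IDs into offsets within each run of equal IDs,
+    # e.g. [0, 0, 1, 1, 1] -> [0, 1, 0, 1, 2].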
+ offset_list = []
+ current_id = id_list[0]
+ offset = 0
+ for i in range(len(id_list)):
+ if id_list[i] != current_id:
+ current_id = id_list[i]
+ offset = 0
+ offset_list.append(offset)
+ offset += 1
+ return offset_list
+
+
+class Video2music:
+ def __init__(
+ self,
+ name="amaai-lab/video2music",
+ device="cuda:0",
+ cache_dir=None,
+ local_files_only=False,
+ ):
+ # path = snapshot_download(repo_id=name, cache_dir=cache_dir)
+
+ self.device = device
+
+ # self.model.device = device
+ # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ # f"{path}/beats/microsoft-deberta-v3-large.pt"
+
+ # self.model_weights = f"{path}/saved_models/AMT/best_loss_weights.pickle"
+ # self.modelReg_weights = f"{path}/saved_models/AMT/best_rmse_weights.pickle"
+
+ self.model_weights = "saved_models/AMT/best_loss_weights.pickle"
+ self.modelReg_weights = "saved_models/AMT/best_rmse_weights.pickle"
+
+ self.total_vf_dim = 776
+ # 768 (sem) + 1 (mo) + 1 (scene) + 6 (emo)
+ self.max_seq_video = 300
+ self.max_seq_chord = 300
+
+ self.model = VideoMusicTransformer(n_layers=6, num_heads=8,
+ d_model=512, dim_feedforward=1024,
+ max_sequence_midi=2048, max_sequence_video=300,
+ max_sequence_chord=300, total_vf_dim=self.total_vf_dim, rpr=RPR).to(device)
+
+ self.model.load_state_dict(torch.load(self.model_weights, map_location=device))
+ self.modelReg = VideoRegression(max_sequence_video=300, total_vf_dim=self.total_vf_dim, regModel= "bigru").to(device)
+ self.modelReg.load_state_dict(torch.load(self.modelReg_weights, map_location=device))
+
+ self.model.eval()
+ self.modelReg.eval()
+
+ self.SF2_FILE = "default_sound_font.sf2"
+
+ def generate(self, video, primer, key):
+
+ feature_dir = Path("./feature")
+ output_dir = Path("./output")
+ if feature_dir.exists():
+ shutil.rmtree(str(feature_dir))
+ if output_dir.exists():
+ shutil.rmtree(str(output_dir))
+
+ feature_dir.mkdir(parents=True)
+ output_dir.mkdir(parents=True)
+
+ frame_dir = feature_dir / "vevo_frame"
+
+ #video features
+ semantic_dir = feature_dir / "vevo_semantic"
+ emotion_dir = feature_dir / "vevo_emotion"
+ scene_dir = feature_dir / "vevo_scene"
+ scene_offset_dir = feature_dir / "vevo_scene_offset"
+ motion_dir = feature_dir / "vevo_motion"
+
+ frame_dir.mkdir(parents=True)
+ semantic_dir.mkdir(parents=True)
+ emotion_dir.mkdir(parents=True)
+ scene_dir.mkdir(parents=True)
+ scene_offset_dir.mkdir(parents=True)
+ motion_dir.mkdir(parents=True)
+
+ #music features
+ chord_dir = feature_dir / "vevo_chord"
+ loudness_dir = feature_dir / "vevo_loudness"
+ note_density_dir = feature_dir / "vevo_note_density"
+
+ chord_dir.mkdir(parents=True)
+ loudness_dir.mkdir(parents=True)
+ note_density_dir.mkdir(parents=True)
+
+ split_video_into_frames(video, frame_dir)
+ gen_semantic_feature(frame_dir, semantic_dir)
+ gen_emotion_feature(frame_dir, emotion_dir)
+ gen_scene_feature(video, scene_dir)
+ gen_scene_offset_feature(scene_dir, scene_offset_dir)
+ gen_motion_feature(video, motion_dir)
+
+ feature_scene_offset = get_scene_offset_feature(scene_offset_dir)
+ feature_motion = get_motion_feature(motion_dir)
+ feature_emotion = get_emotion_feature(emotion_dir)
+ feature_semantic = get_semantic_feature(semantic_dir)
+
+ # cuda
+ feature_scene_offset = feature_scene_offset.to(self.device)
+ feature_motion = feature_motion.to(self.device)
+ feature_emotion = feature_emotion.to(self.device)
+
+ feature_scene_offset = feature_scene_offset.unsqueeze(0)
+ feature_motion = feature_motion.unsqueeze(0)
+ feature_emotion = feature_emotion.unsqueeze(0)
+
+ feature_semantic = feature_semantic.to(self.device)
+ feature_semantic_list = []
+ feature_semantic = torch.unsqueeze(feature_semantic, 0)
+ feature_semantic_list.append( feature_semantic.to(self.device) )
+ #feature_semantic_list.append( feature_semantic )
+
+ if "major" in key:
+ feature_key = torch.tensor([0])
+ feature_key = feature_key.float()
+ elif "minor" in key:
+ feature_key = torch.tensor([1])
+ feature_key = feature_key.float()
+
+ feature_key = feature_key.to(self.device)
+
+ with open('dataset/vevo_meta/chord.json') as json_file:
+ chordDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_inv.json') as json_file:
+ chordInvDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_root.json') as json_file:
+ chordRootDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_attr.json') as json_file:
+ chordAttrDic = json.load(json_file)
+
+ if primer.strip() == "":
+ if "major" in key:
+ primer = "C"
+ else:
+ primer = "Am"
+
+ pChordList = primer.split(" ")
+
+ primerCID = []
+ primerCID_root = []
+ primerCID_attr = []
+
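+        # Normalize primer chords such as "Bb", "Am", or "CM7" into the
+        # "root:quality" labels of dataset/vevo_meta/chord.json
+        # (e.g. "Bb" -> "A#", "Am" -> "A:min", "CM7" -> "C:maj7").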
+ for pChord in pChordList:
+ if len(pChord) > 1:
+ if pChord[1] == "b":
+ pChord = flatsharpDic [ pChord[0:2] ] + pChord[2:]
+ type_idx = 0
+ if pChord[1] == "#":
+ pChord = pChord[0:2] + ":" + pChord[2:]
+ type_idx = 2
+ else:
+ pChord = pChord[0:1] + ":" + pChord[1:]
+ type_idx = 1
+ if pChord[type_idx+1:] == "m":
+ pChord = pChord[0:type_idx] + ":min"
+ if pChord[type_idx+1:] == "m6":
+ pChord = pChord[0:type_idx] + ":min6"
+ if pChord[type_idx+1:] == "m7":
+ pChord = pChord[0:type_idx] + ":min7"
+ if pChord[type_idx+1:] == "M6":
+ pChord = pChord[0:type_idx] + ":maj6"
+ if pChord[type_idx+1:] == "M7":
+ pChord = pChord[0:type_idx] + ":maj7"
+ if pChord[type_idx+1:] == "":
+ pChord = pChord[0:type_idx]
+
+ chordID = chordDic[pChord]
+ primerCID.append(chordID)
+
+ chord_arr = pChord.split(":")
+ if len(chord_arr) == 1:
+ chordRootID = chordRootDic[chord_arr[0]]
+ primerCID_root.append(chordRootID)
+ primerCID_attr.append(0)
+ elif len(chord_arr) == 2:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ primerCID_root.append(chordRootID)
+ primerCID_attr.append(chordAttrID)
+
+ primerCID = np.array(primerCID)
+ primerCID = torch.from_numpy(primerCID)
+ primerCID = primerCID.to(torch.long)
+ primerCID = primerCID.to(self.device)
+
+ primerCID_root = np.array(primerCID_root)
+ primerCID_root = torch.from_numpy(primerCID_root)
+ primerCID_root = primerCID_root.to(torch.long)
+ primerCID_root = primerCID_root.to(self.device)
+
+ primerCID_attr = np.array(primerCID_attr)
+ primerCID_attr = torch.from_numpy(primerCID_attr)
+ primerCID_attr = primerCID_attr.to(torch.long)
+ primerCID_attr = primerCID_attr.to(self.device)
+
+ # self.model.eval()
+ # self.modelReg.eval()
+
+ with torch.set_grad_enabled(False):
+ rand_seq = self.model.generate(feature_semantic_list=feature_semantic_list,
+ feature_key=feature_key,
+ feature_scene_offset=feature_scene_offset,
+ feature_motion=feature_motion,
+ feature_emotion=feature_emotion,
+ primer = primerCID,
+ primer_root = primerCID_root,
+ primer_attr = primerCID_attr,
+ target_seq_length = 300,
+ beam=0,
+ max_conseq_N= max_conseq_N,
+ max_conseq_chord = max_conseq_chord)
+
+ y = self.modelReg(
+ feature_semantic_list,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ y_note_density, y_loudness = torch.split(y, split_size_or_sections=1, dim=1)
+ y_note_density_np = y_note_density.cpu().numpy()
+ y_note_density_np = np.round(y_note_density_np).astype(int)
+ y_note_density_np = np.clip(y_note_density_np, 0, 40)
+
+ y_loudness_np = y_loudness.cpu().numpy()
+ y_loudness_np_lv = (y_loudness_np * 100).astype(int)
+ y_loudness_np_lv = np.clip(y_loudness_np_lv, 0, 50)
+ velolistExp = []
+ exponent = 0.3
+ for item in y_loudness_np_lv:
+ loudness = item[0]
+ velocity_exp = np.round(((loudness - min_loudness) / (max_loudness - min_loudness)) ** exponent * (max_velocity - min_velocity) + min_velocity)
+ velocity_exp = int(velocity_exp)
+ velolistExp.append(velocity_exp)
+
+                # Map predicted note density onto one of five pattern levels.
+                densitylist = []
+                for item in y_note_density_np:
+                    density = item[0]
+ if density <= 6:
+ densitylist.append(0)
+ elif density <= 12:
+ densitylist.append(1)
+ elif density <= 18:
+ densitylist.append(2)
+ elif density <= 24:
+ densitylist.append(3)
+ else:
+ densitylist.append(4)
+
+ # generated ChordID to ChordSymbol
+ chord_genlist = []
+ chordID_genlist= rand_seq[0].cpu().numpy()
+ for i in chordID_genlist:
+ chord_genlist.append(chordInvDic[str(i)])
+
+ chord_offsetlist = convert_format_id_to_offset(chord_genlist)
+ f_path_midi = output_dir / "output.mid"
+ f_path_flac = output_dir / "output.flac"
+ f_path_video_out = output_dir / "output.mp4"
+
+ # ChordSymbol to MIDI file with voicing
+ MIDI = MIDIFile(1)
+ MIDI.addTempo(0, 0, tempo)
+            midi_chords_original = []
+            for i, k in enumerate(chord_genlist):
+                k = k.replace(":", "")
+                if k == "N":
+                    midi_chords_original.append([])
+                else:
+                    midi_chords_original.append(Chord(k).getMIDI("c", 4))
+            midi_chords = voice(midi_chords_original)
+            trans = transpose_key_dic[key]
+
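+            # Five accompaniment patterns of increasing rhythmic density:
+            # level 0 plays two chord tones per chord, level 4 plays eight,
+            # alternating voicings on even/odd offsets within a chord run.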
+ for i, chord in enumerate(midi_chords):
+ if densitylist[i] == 0:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 1 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ elif densitylist[i] == 1:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ elif densitylist[i] == 2:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ elif densitylist[i] == 3:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ elif densitylist[i] == 4:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.75 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.75 , duration, velolistExp[i])
+
+ with open(f_path_midi, "wb") as outputFile:
+ MIDI.writeFile(outputFile)
+
+ # Convert midi to audio (e.g., flac)
+ fs = FluidSynth(sound_font=self.SF2_FILE)
+ fs.midi_to_audio(str(f_path_midi), str(f_path_flac))
+
+ # Render generated music into input video
+ audio_mp = mp.AudioFileClip(str(f_path_flac))
+ video_mp = mp.VideoFileClip(str(video))
+
+ audio_mp = audio_mp.subclip(0, video_mp.duration )
+ final = video_mp.set_audio(audio_mp)
+
+ final.write_videofile(str(f_path_video_out),
+ codec='libx264',
+ audio_codec='aac',
+ temp_audiofile='temp-audio.m4a',
+ remove_temp=True
+ )
+ return Path(str(f_path_video_out))
+
+
+# Initialize Video2Music
+if torch.cuda.is_available():
+ video2music = Video2music()
+else:
+ video2music = Video2music(device="cpu")
+
+
+def gradio_generate(input_video, input_primer, input_key):
+ output_filename = video2music.generate(input_video, input_primer, input_key)
+ return str(output_filename)
+
+
+title="Video2Music: Suitable Music Generation from Videos using an Affective Multimodal Transformer model"
+description_text = """
+
+Generate background music using Video2Music by providing an input video.
+
+This is the demo for Video2Music: Suitable Music Generation from Videos using an Affective Multimodal Transformer model.
+Read our paper.
+
+"""
+input_video = gr.Video(label="Input Video")
+input_primer = gr.Textbox(label="Input Primer", value="C Am F G")
+input_key = gr.Dropdown(choices=["C major", "A minor"], value="C major", label="Input Key")
+output_video = gr.Video(label="Output Video")
+
+css = '''
+#duplicate-button {
+margin: auto;
+color: white;
+background: #1565c0;
+border-radius: 100vh;
+}
+'''
+
+# Gradio interface
+gr_interface = gr.Interface(
+ fn=gradio_generate,
+ inputs=[input_video, input_primer, input_key ],
+ outputs=[output_video],
+ description=description_text,
+ allow_flagging='never',
+ cache_examples=True,
+)
+
+
+# with gr.Blocks() as demo:
+with gr.Blocks(css=css) as demo:
+ title=gr.HTML(f"{title}
")
+ gr_interface.render()
+
+#demo.queue()
+# demo.launch(debug=True)
+
+demo.queue().launch()
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
\ No newline at end of file
diff --git a/dataset/README.md b/dataset/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8665e83e25d91c7bce9b6351679a6866b152df11
--- /dev/null
+++ b/dataset/README.md
@@ -0,0 +1,31 @@
+# MuVi-Sync Dataset
+
+- Dataset (MuVi-Sync)
+ * MuVi-Sync (features) [(Link)](https://zenodo.org/records/10057093)
+ * MuVi-Sync (original video) [(Link)](https://zenodo.org/records/10050294)
+
+## Overview
+Welcome to the MuVi-Sync dataset! This collection provides a rich array of features for both music and video elements. Here's a breakdown of the directory structure:
+
+### Music Features
+- **vevo_chord:** Chord feature data
+- **vevo_note_density:** Note density feature data
+- **vevo_loudness:** Loudness feature data
+
+### Video Features
+- **vevo_scene_offset:** Scene offset feature data
+- **vevo_emotion:** Emotion feature data
+ - *5c_l14p:* 5 emotion categories (exciting, fearful, tense, sad, relaxing)
+ - *6c_l14p:* 6 emotion categories (exciting, fearful, tense, sad, relaxing, neutral)
+- **vevo_semantic:** Semantic feature
+- **vevo_motion:** Motion feature
+
+### Others
+- **vevo_meta:**
+  - *idlist.txt:* List of file IDs, video titles, and YouTube IDs
+- **vevo:** Original video files (.mp4)
+
+Explore and utilize this dataset for innovative research and applications.
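+
+A minimal sketch of how these files might be read (the `001` file ID and paths are illustrative; `.lab` files are whitespace-separated text and semantic features are NumPy arrays, matching the loader in `dataset/vevo_dataset.py`):
+
+```python
+import numpy as np
+
+# Chord labels: "<time> <chord>" lines, with an optional leading
+# "key <tonic> <mode>" header line.
+chords = {}
+with open("vevo_chord/lab_v2_norm/all/001.lab", encoding="utf-8") as f:
+    for line in f:
+        parts = line.split()
+        if parts and parts[0] != "key":
+            chords[int(parts[0])] = parts[1]
+
+# Semantic features: one (num_seconds, 768) float array per video.
+semantic = np.load("vevo_semantic/all/2d/clip_l14p/001.npy")
+print(len(chords), semantic.shape)
+```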
+
+For more details, refer to our [GitHub repository](https://github.com/AMAAI-Lab/Video2Music).
+
diff --git a/dataset/__pycache__/vevo_dataset.cpython-37.pyc b/dataset/__pycache__/vevo_dataset.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba60ba07fae3faba4dfc512743065182986e39f0
Binary files /dev/null and b/dataset/__pycache__/vevo_dataset.cpython-37.pyc differ
diff --git a/dataset/vevo_dataset.py b/dataset/vevo_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..0373f5e15120481da4ea524d254ba121000e762a
--- /dev/null
+++ b/dataset/vevo_dataset.py
@@ -0,0 +1,720 @@
+import os
+import pickle
+import random
+import torch
+import torch.nn as nn
+import numpy as np
+
+from torch.utils.data import Dataset
+from utilities.constants import *
+from utilities.device import cpu_device
+from utilities.device import get_device
+
+import json
+
+SEQUENCE_START = 0
+
+class VevoDataset(Dataset):
+ def __init__(self, dataset_root = "./dataset/", split="train", split_ver="v1", vis_models="2d/clip_l14p", emo_model="6c_l14p", max_seq_chord=300, max_seq_video=300, random_seq=True, is_video = True):
+
+ self.dataset_root = dataset_root
+
+ self.vevo_chord_root = os.path.join( dataset_root, "vevo_chord", "lab_v2_norm", "all")
+ self.vevo_emotion_root = os.path.join( dataset_root, "vevo_emotion", emo_model, "all")
+ self.vevo_motion_root = os.path.join( dataset_root, "vevo_motion", "all")
+ self.vevo_scene_offset_root = os.path.join( dataset_root, "vevo_scene_offset", "all")
+ self.vevo_meta_split_path = os.path.join( dataset_root, "vevo_meta", "split", split_ver, split + ".txt")
+
+ self.vevo_loudness_root = os.path.join( dataset_root, "vevo_loudness", "all")
+ self.vevo_note_density_root = os.path.join( dataset_root, "vevo_note_density", "all")
+
+ self.max_seq_video = max_seq_video
+ self.max_seq_chord = max_seq_chord
+ self.random_seq = random_seq
+ self.is_video = is_video
+
+ self.vis_models_arr = vis_models.split(" ")
+ self.vevo_semantic_root_list = []
+ self.id_list = []
+
+ self.emo_model = emo_model
+
+        if self.is_video:
+ for i in range( len(self.vis_models_arr) ):
+ p1 = self.vis_models_arr[i].split("/")[0]
+ p2 = self.vis_models_arr[i].split("/")[1]
+ vevo_semantic_root = os.path.join(dataset_root, "vevo_semantic" , "all" , p1, p2)
+ self.vevo_semantic_root_list.append( vevo_semantic_root )
+
+ with open( self.vevo_meta_split_path ) as f:
+ for line in f:
+ self.id_list.append(line.strip())
+
+ self.data_files_chord = []
+ self.data_files_emotion = []
+ self.data_files_motion = []
+ self.data_files_scene_offset = []
+ self.data_files_semantic_list = []
+
+ self.data_files_loudness = []
+ self.data_files_note_density = []
+
+ for i in range(len(self.vis_models_arr)):
+ self.data_files_semantic_list.append([])
+
+ for fid in self.id_list:
+ fpath_chord = os.path.join( self.vevo_chord_root, fid + ".lab" )
+ fpath_emotion = os.path.join( self.vevo_emotion_root, fid + ".lab" )
+ fpath_motion = os.path.join( self.vevo_motion_root, fid + ".lab" )
+ fpath_scene_offset = os.path.join( self.vevo_scene_offset_root, fid + ".lab" )
+
+ fpath_loudness = os.path.join( self.vevo_loudness_root, fid + ".lab" )
+ fpath_note_density = os.path.join( self.vevo_note_density_root, fid + ".lab" )
+
+ fpath_semantic_list = []
+ for vevo_semantic_root in self.vevo_semantic_root_list:
+ fpath_semantic = os.path.join( vevo_semantic_root, fid + ".npy" )
+ fpath_semantic_list.append(fpath_semantic)
+
+ checkFile_semantic = True
+ for fpath_semantic in fpath_semantic_list:
+ if not os.path.exists(fpath_semantic):
+ checkFile_semantic = False
+
+ checkFile_chord = os.path.exists(fpath_chord)
+ checkFile_emotion = os.path.exists(fpath_emotion)
+ checkFile_motion = os.path.exists(fpath_motion)
+ checkFile_scene_offset = os.path.exists(fpath_scene_offset)
+
+ checkFile_loudness = os.path.exists(fpath_loudness)
+ checkFile_note_density = os.path.exists(fpath_note_density)
+
+ if checkFile_chord and checkFile_emotion and checkFile_motion \
+ and checkFile_scene_offset and checkFile_semantic and checkFile_loudness and checkFile_note_density :
+
+ self.data_files_chord.append(fpath_chord)
+ self.data_files_emotion.append(fpath_emotion)
+ self.data_files_motion.append(fpath_motion)
+ self.data_files_scene_offset.append(fpath_scene_offset)
+
+ self.data_files_loudness.append(fpath_loudness)
+ self.data_files_note_density.append(fpath_note_density)
+
+                if self.is_video:
+ for i in range(len(self.vis_models_arr)):
+ self.data_files_semantic_list[i].append( fpath_semantic_list[i] )
+
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+
+ chordRootDicPath = os.path.join( dataset_root, "vevo_meta/chord_root.json")
+ chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+ with open(chordDicPath) as json_file:
+ self.chordDic = json.load(json_file)
+
+ with open(chordRootDicPath) as json_file:
+ self.chordRootDic = json.load(json_file)
+
+ with open(chordAttrDicPath) as json_file:
+ self.chordAttrDic = json.load(json_file)
+
+ def __len__(self):
+ return len(self.data_files_chord)
+
+ def __getitem__(self, idx):
+ #### ---- CHORD ----- ####
+ feature_chord = np.empty(self.max_seq_chord)
+ feature_chord.fill(CHORD_PAD)
+
+ feature_chordRoot = np.empty(self.max_seq_chord)
+ feature_chordRoot.fill(CHORD_ROOT_PAD)
+ feature_chordAttr = np.empty(self.max_seq_chord)
+ feature_chordAttr.fill(CHORD_ATTR_PAD)
+
+ key = ""
+ with open(self.data_files_chord[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if line_arr[0] == "key":
+ key = line_arr[1] + " "+ line_arr[2]
+ continue
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ chord = line_arr[1]
+ chordID = self.chordDic[chord]
+ feature_chord[time] = chordID
+ chord_arr = chord.split(":")
+
+ if len(chord_arr) == 1:
+ if chord_arr[0] == "N":
+ chordRootID = self.chordRootDic["N"]
+ chordAttrID = self.chordAttrDic["N"]
+ feature_chordRoot[time] = chordRootID
+ feature_chordAttr[time] = chordAttrID
+ else:
+ chordRootID = self.chordRootDic[chord_arr[0]]
+ feature_chordRoot[time] = chordRootID
+ feature_chordAttr[time] = 1
+ elif len(chord_arr) == 2:
+ chordRootID = self.chordRootDic[chord_arr[0]]
+ chordAttrID = self.chordAttrDic[chord_arr[1]]
+ feature_chordRoot[time] = chordRootID
+ feature_chordAttr[time] = chordAttrID
+
+ if "major" in key:
+ feature_key = torch.tensor([0])
+ else:
+ feature_key = torch.tensor([1])
+
+ feature_chord = torch.from_numpy(feature_chord)
+ feature_chord = feature_chord.to(torch.long)
+
+ feature_chordRoot = torch.from_numpy(feature_chordRoot)
+ feature_chordRoot = feature_chordRoot.to(torch.long)
+
+ feature_chordAttr = torch.from_numpy(feature_chordAttr)
+ feature_chordAttr = feature_chordAttr.to(torch.long)
+
+ feature_key = feature_key.float()
+
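+        # Next-chord prediction pairs: x drops the final step, tgt is shifted
+        # left by one; the last timestep read gets an END token below.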
+ x = feature_chord[:self.max_seq_chord-1]
+ tgt = feature_chord[1:self.max_seq_chord]
+
+ x_root = feature_chordRoot[:self.max_seq_chord-1]
+ tgt_root = feature_chordRoot[1:self.max_seq_chord]
+ x_attr = feature_chordAttr[:self.max_seq_chord-1]
+ tgt_attr = feature_chordAttr[1:self.max_seq_chord]
+
+ if time < self.max_seq_chord:
+ tgt[time] = CHORD_END
+ tgt_root[time] = CHORD_ROOT_END
+ tgt_attr[time] = CHORD_ATTR_END
+
+ #### ---- SCENE OFFSET ----- ####
+ feature_scene_offset = np.empty(self.max_seq_video)
+ feature_scene_offset.fill(SCENE_OFFSET_PAD)
+ with open(self.data_files_scene_offset[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ sceneID = line_arr[1]
+ feature_scene_offset[time] = int(sceneID)+1
+
+ feature_scene_offset = torch.from_numpy(feature_scene_offset)
+ feature_scene_offset = feature_scene_offset.to(torch.float32)
+
+ #### ---- MOTION ----- ####
+ feature_motion = np.empty(self.max_seq_video)
+ feature_motion.fill(MOTION_PAD)
+ with open(self.data_files_motion[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ motion = line_arr[1]
+ feature_motion[time] = float(motion)
+
+ feature_motion = torch.from_numpy(feature_motion)
+ feature_motion = feature_motion.to(torch.float32)
+
+ #### ---- NOTE_DENSITY ----- ####
+ feature_note_density = np.empty(self.max_seq_video)
+ feature_note_density.fill(NOTE_DENSITY_PAD)
+ with open(self.data_files_note_density[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ note_density = line_arr[1]
+ feature_note_density[time] = float(note_density)
+
+ feature_note_density = torch.from_numpy(feature_note_density)
+ feature_note_density = feature_note_density.to(torch.float32)
+
+ #### ---- LOUDNESS ----- ####
+ feature_loudness = np.empty(self.max_seq_video)
+ feature_loudness.fill(LOUDNESS_PAD)
+ with open(self.data_files_loudness[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ loudness = line_arr[1]
+ feature_loudness[time] = float(loudness)
+
+ feature_loudness = torch.from_numpy(feature_loudness)
+ feature_loudness = feature_loudness.to(torch.float32)
+
+ #### ---- EMOTION ----- ####
+ if self.emo_model.startswith("6c"):
+ feature_emotion = np.empty( (self.max_seq_video, 6))
+ else:
+ feature_emotion = np.empty( (self.max_seq_video, 5))
+
+ feature_emotion.fill(EMOTION_PAD)
+ with open(self.data_files_emotion[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if line_arr[0] == "time":
+ continue
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+
+ if len(line_arr) == 7:
+ emo1, emo2, emo3, emo4, emo5, emo6 = \
+ line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5],line_arr[6]
+ emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5), float(emo6) ]
+ elif len(line_arr) == 6:
+ emo1, emo2, emo3, emo4, emo5 = \
+ line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5]
+ emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5) ]
+
+ emoList = np.array(emoList)
+ feature_emotion[time] = emoList
+
+ feature_emotion = torch.from_numpy(feature_emotion)
+ feature_emotion = feature_emotion.to(torch.float32)
+
+        # Dominant emotion index and its probability at each timestep.
+        max_prob_values, feature_emotion_argmax = torch.max(feature_emotion, dim=1)
+
+ # -- emotion to chord
+ # maj dim sus4 min7 min sus2 aug dim7 maj6 hdim7 7 min6 maj7
+        # 0. exciting : [1,0,1,0,0,0,0,0,0,0,1,0,0]
+ # 1. fearful : [0,1,0,1,0,0,0,1,0,1,0,0,0]
+ # 2. tense : [0,1,1,1,0,0,0,0,0,0,1,0,0]
+ # 3. sad : [0,0,0,1,1,1,0,0,0,0,0,0,0]
+ # 4. relaxing: [1,0,0,0,0,0,0,0,1,0,0,0,1]
+ # 5. neutral : [0,0,0,0,0,0,0,0,0,0,0,0,0]
+
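+        # Each mask below has length 159, matching vevo_meta/chord.json:
+        # index 0 is "N", then 13 qualities x 12 roots, then END and PAD.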
+ a0 = [0]+[1,0,1,0,0,0,0,0,0,0,1,0,0]*12+[0,0]
+ a1 = [0]+[0,1,0,1,0,0,0,1,0,1,0,0,0]*12+[0,0]
+ a2 = [0]+[0,1,1,1,0,0,0,0,0,0,1,0,0]*12+[0,0]
+ a3 = [0]+[0,0,0,1,1,1,0,0,0,0,0,0,0]*12+[0,0]
+ a4 = [0]+[1,0,0,0,0,0,0,0,1,0,0,0,1]*12+[0,0]
+ a5 = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[0,0]
+
+ aend = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[1,0]
+ apad = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[0,1]
+
+ a0_tensor = torch.tensor(a0)
+ a1_tensor = torch.tensor(a1)
+ a2_tensor = torch.tensor(a2)
+ a3_tensor = torch.tensor(a3)
+ a4_tensor = torch.tensor(a4)
+ a5_tensor = torch.tensor(a5)
+
+ aend_tensor = torch.tensor(aend)
+ apad_tensor = torch.tensor(apad)
+
+ mapped_tensor = torch.zeros((300, 159))
+ for i, val in enumerate(feature_emotion_argmax):
+ if feature_chord[i] == CHORD_PAD:
+ mapped_tensor[i] = apad_tensor
+ elif feature_chord[i] == CHORD_END:
+ mapped_tensor[i] = aend_tensor
+ elif val == 0:
+ mapped_tensor[i] = a0_tensor
+ elif val == 1:
+ mapped_tensor[i] = a1_tensor
+ elif val == 2:
+ mapped_tensor[i] = a2_tensor
+ elif val == 3:
+ mapped_tensor[i] = a3_tensor
+ elif val == 4:
+ mapped_tensor[i] = a4_tensor
+ elif val == 5:
+ mapped_tensor[i] = a5_tensor
+
+ # feature emotion : [1, 300, 6]
+ # y : [299, 159]
+ # tgt : [299]
+ # tgt_emo : [299, 159]
+ # tgt_emo_prob : [299]
+
+ tgt_emotion = mapped_tensor[1:]
+ tgt_emotion_prob = max_prob_values[1:]
+
+ feature_semantic_list = []
+ if self.is_video:
+ for i in range( len(self.vis_models_arr) ):
+ video_feature = np.load(self.data_files_semantic_list[i][idx])
+                dim_vf = video_feature.shape[1]  # e.g. 768 for CLIP ViT-L/14
+ video_feature_tensor = torch.from_numpy( video_feature )
+
+ feature_semantic = torch.full((self.max_seq_video, dim_vf,), SEMANTIC_PAD , dtype=torch.float32, device=cpu_device())
+ if(video_feature_tensor.shape[0] < self.max_seq_video):
+ feature_semantic[:video_feature_tensor.shape[0]] = video_feature_tensor
+ else:
+ feature_semantic = video_feature_tensor[:self.max_seq_video]
+ feature_semantic_list.append(feature_semantic)
+
+ return { "x":x,
+ "tgt":tgt,
+ "x_root":x_root,
+ "tgt_root":tgt_root,
+ "x_attr":x_attr,
+ "tgt_attr":tgt_attr,
+ "semanticList": feature_semantic_list,
+ "key": feature_key,
+ "scene_offset": feature_scene_offset,
+ "motion": feature_motion,
+ "emotion": feature_emotion,
+ "tgt_emotion" : tgt_emotion,
+ "tgt_emotion_prob" : tgt_emotion_prob,
+ "note_density" : feature_note_density,
+ "loudness" : feature_loudness
+ }
+
+def create_vevo_datasets(dataset_root = "./dataset", max_seq_chord=300, max_seq_video=300, vis_models="2d/clip_l14p", emo_model="6c_l14p", split_ver="v1", random_seq=True, is_video=True):
+
+ train_dataset = VevoDataset(
+ dataset_root = dataset_root, split="train", split_ver=split_ver,
+ vis_models=vis_models, emo_model =emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+ random_seq=random_seq, is_video = is_video )
+
+ val_dataset = VevoDataset(
+ dataset_root = dataset_root, split="val", split_ver=split_ver,
+ vis_models=vis_models, emo_model =emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+ random_seq=random_seq, is_video = is_video )
+
+ test_dataset = VevoDataset(
+ dataset_root = dataset_root, split="test", split_ver=split_ver,
+ vis_models=vis_models, emo_model =emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+ random_seq=random_seq, is_video = is_video )
+
+ return train_dataset, val_dataset, test_dataset
+
+def compute_vevo_accuracy(out, tgt):
+ softmax = nn.Softmax(dim=-1)
+ out = torch.argmax(softmax(out), dim=-1)
+
+ out = out.flatten()
+ tgt = tgt.flatten()
+
+ mask = (tgt != CHORD_PAD)
+
+ out = out[mask]
+ tgt = tgt[mask]
+
+ if(len(tgt) == 0):
+ return 1.0
+
+ num_right = (out == tgt)
+ num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+ acc = num_right / len(tgt)
+
+ return acc
+
+def compute_hits_k(out, tgt, k):
+ softmax = nn.Softmax(dim=-1)
+ out = softmax(out)
+ _, topk_indices = torch.topk(out, k, dim=-1) # Get the indices of top-k values
+
+ tgt = tgt.flatten()
+
+ topk_indices = torch.squeeze(topk_indices, dim = 0)
+
+ num_right = 0
+ pt = 0
+ for i, tlist in enumerate(topk_indices):
+ if tgt[i] == CHORD_PAD:
+ num_right += 0
+ else:
+ pt += 1
+ if tgt[i].item() in tlist:
+ num_right += 1
+
+ # Empty
+ if len(tgt) == 0:
+ return 1.0
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ hitk = num_right / pt
+
+ return hitk
+
+def compute_hits_k_root_attr(out_root, out_attr, tgt, k):
+ softmax = nn.Softmax(dim=-1)
+ out_root = softmax(out_root)
+ out_attr = softmax(out_attr)
+
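+    # Combine the factorized heads into a joint distribution over the
+    # 159-chord vocabulary: index 0 is "N", indices 1..156 decompose as
+    # root (i-1)//13 + 1 and attr (i-1)%13 + 1, then END (157) and PAD (158).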
+ tensor_shape = torch.Size([1, 299, 159])
+ out = torch.zeros(tensor_shape)
+ for i in range(out.shape[-1]):
+ if i == 0 :
+ out[0, :, i] = out_root[0, :, 0] * out_attr[0, :, 0]
+ elif i == 157:
+ out[0, :, i] = out_root[0, :, 13] * out_attr[0, :, 14]
+ elif i == 158:
+ out[0, :, i] = out_root[0, :, 14] * out_attr[0, :, 15]
+ else:
+ rootindex = int( (i-1)/13 ) + 1
+ attrindex = (i-1)%13 + 1
+ out[0, :, i] = out_root[0, :, rootindex] * out_attr[0, :, attrindex]
+
+ out = softmax(out)
+ _, topk_indices = torch.topk(out, k, dim=-1) # Get the indices of top-k values
+
+ tgt = tgt.flatten()
+
+ topk_indices = torch.squeeze(topk_indices, dim = 0)
+
+ num_right = 0
+ pt = 0
+ for i, tlist in enumerate(topk_indices):
+ if tgt[i] == CHORD_PAD:
+ num_right += 0
+ else:
+ pt += 1
+ if tgt[i].item() in tlist:
+ num_right += 1
+
+ if len(tgt) == 0:
+ return 1.0
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ hitk = num_right / pt
+
+ return hitk
+
+def compute_vevo_correspondence(out, tgt, tgt_emotion, tgt_emotion_prob, emotion_threshold):
+
+ tgt_emotion = tgt_emotion.squeeze()
+ tgt_emotion_prob = tgt_emotion_prob.squeeze()
+
+ dataset_root = "./dataset/"
+ chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+ chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+ chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+ chordInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_inv.json")
+
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrDicPath) as json_file:
+ chordAttrDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+ with open(chordDicPath) as json_file:
+ chordDic = json.load(json_file)
+ with open(chordInvDicPath) as json_file:
+ chordInvDic = json.load(json_file)
+
+ softmax = nn.Softmax(dim=-1)
+ out = torch.argmax(softmax(out), dim=-1)
+ out = out.flatten()
+
+ tgt = tgt.flatten()
+
+ num_right = 0
+ tgt_emotion_quality = tgt_emotion[:, 0:14]
+ pt = 0
+ for i, out_element in enumerate( out ):
+
+ all_zeros = torch.all(tgt_emotion_quality[i] == 0)
+ if tgt_emotion[i][-1] == 1 or all_zeros or tgt_emotion_prob[i] < emotion_threshold:
+ num_right += 0
+ else:
+ pt += 1
+ if out_element.item() != CHORD_END and out_element.item() != CHORD_PAD:
+ gen_chord = chordInvDic[ str( out_element.item() ) ]
+
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ out_quality = 1
+ elif len(chord_arr) == 2:
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ out_quality = chordAttrID # 0:N, 1:maj ... 13:maj7
+
+ if tgt_emotion_quality[i][out_quality] == 1:
+ num_right += 1
+
+
+ if(len(tgt_emotion) == 0):
+ return 1.0
+
+ if(pt == 0):
+ return -1
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ acc = num_right / pt
+
+ return acc
+
+def compute_vevo_correspondence_root_attr(y_root, y_attr, tgt, tgt_emotion, tgt_emotion_prob, emotion_threshold):
+
+ tgt_emotion = tgt_emotion.squeeze()
+ tgt_emotion_prob = tgt_emotion_prob.squeeze()
+
+ dataset_root = "./dataset/"
+ chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+ chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+ chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+ chordInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_inv.json")
+
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrDicPath) as json_file:
+ chordAttrDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+ with open(chordDicPath) as json_file:
+ chordDic = json.load(json_file)
+ with open(chordInvDicPath) as json_file:
+ chordInvDic = json.load(json_file)
+
+ softmax = nn.Softmax(dim=-1)
+
+ y_root = torch.argmax(softmax(y_root), dim=-1)
+ y_attr = torch.argmax(softmax(y_attr), dim=-1)
+
+ y_root = y_root.flatten()
+ y_attr = y_attr.flatten()
+
+ tgt = tgt.flatten()
+ y = np.empty( len(tgt) )
+
+ y.fill(CHORD_PAD)
+
+ for i in range(len(tgt)):
+ if y_root[i].item() == CHORD_ROOT_PAD or y_attr[i].item() == CHORD_ATTR_PAD:
+ y[i] = CHORD_PAD
+ elif y_root[i].item() == CHORD_ROOT_END or y_attr[i].item() == CHORD_ATTR_END:
+ y[i] = CHORD_END
+ else:
+ chordRoot = chordRootInvDic[str(y_root[i].item())]
+ chordAttr = chordAttrInvDic[str(y_attr[i].item())]
+ if chordRoot == "N":
+ y[i] = 0
+ else:
+ if chordAttr == "N" or chordAttr == "maj":
+ y[i] = chordDic[chordRoot]
+ else:
+ chord = chordRoot + ":" + chordAttr
+ y[i] = chordDic[chord]
+
+ y = torch.from_numpy(y)
+ y = y.to(torch.long)
+ y = y.to(get_device())
+ y = y.flatten()
+
+ num_right = 0
+ tgt_emotion_quality = tgt_emotion[:, 0:14]
+ pt = 0
+ for i, y_element in enumerate( y ):
+ all_zeros = torch.all(tgt_emotion_quality[i] == 0)
+ if tgt_emotion[i][-1] == 1 or all_zeros or tgt_emotion_prob[i] < emotion_threshold:
+ num_right += 0
+ else:
+ pt += 1
+ if y_element.item() != CHORD_END and y_element.item() != CHORD_PAD:
+ gen_chord = chordInvDic[ str( y_element.item() ) ]
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ y_quality = 1
+ elif len(chord_arr) == 2:
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ y_quality = chordAttrID # 0:N, 1:maj ... 13:maj7
+
+ if tgt_emotion_quality[i][y_quality] == 1:
+ num_right += 1
+
+ if(len(tgt_emotion) == 0):
+ return 1.0
+
+ if(pt == 0):
+ return -1
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ acc = num_right / pt
+ return acc
+
+def compute_vevo_accuracy_root_attr(y_root, y_attr, tgt):
+
+ dataset_root = "./dataset/"
+ chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+ chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+ with open(chordDicPath) as json_file:
+ chordDic = json.load(json_file)
+
+ softmax = nn.Softmax(dim=-1)
+
+ y_root = torch.argmax(softmax(y_root), dim=-1)
+ y_attr = torch.argmax(softmax(y_attr), dim=-1)
+
+ y_root = y_root.flatten()
+ y_attr = y_attr.flatten()
+
+ tgt = tgt.flatten()
+
+ mask = (tgt != CHORD_PAD)
+ y = np.empty( len(tgt) )
+ y.fill(CHORD_PAD)
+
+ for i in range(len(tgt)):
+ if y_root[i].item() == CHORD_ROOT_PAD or y_attr[i].item() == CHORD_ATTR_PAD:
+ y[i] = CHORD_PAD
+ elif y_root[i].item() == CHORD_ROOT_END or y_attr[i].item() == CHORD_ATTR_END:
+ y[i] = CHORD_END
+ else:
+ chordRoot = chordRootInvDic[str(y_root[i].item())]
+ chordAttr = chordAttrInvDic[str(y_attr[i].item())]
+ if chordRoot == "N":
+ y[i] = 0
+ else:
+ if chordAttr == "N" or chordAttr == "maj":
+ y[i] = chordDic[chordRoot]
+ else:
+ chord = chordRoot + ":" + chordAttr
+ y[i] = chordDic[chord]
+
+ y = torch.from_numpy(y)
+ y = y.to(torch.long)
+ y = y.to(get_device())
+
+ y = y[mask]
+ tgt = tgt[mask]
+
+ # Empty
+ if(len(tgt) == 0):
+ return 1.0
+
+ num_right = (y == tgt)
+ num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+ acc = num_right / len(tgt)
+
+ return acc
+
diff --git a/dataset/vevo_meta/chord.json b/dataset/vevo_meta/chord.json
new file mode 100644
index 0000000000000000000000000000000000000000..cafa2d8e0f8e842773de8b1e9fe9c0313342b4ae
--- /dev/null
+++ b/dataset/vevo_meta/chord.json
@@ -0,0 +1 @@
+{"N": 0, "C": 1, "C:dim": 2, "C:sus4": 3, "C:min7": 4, "C:min": 5, "C:sus2": 6, "C:aug": 7, "C:dim7": 8, "C:maj6": 9, "C:hdim7": 10, "C:7": 11, "C:min6": 12, "C:maj7": 13, "C#": 14, "C#:dim": 15, "C#:sus4": 16, "C#:min7": 17, "C#:min": 18, "C#:sus2": 19, "C#:aug": 20, "C#:dim7": 21, "C#:maj6": 22, "C#:hdim7": 23, "C#:7": 24, "C#:min6": 25, "C#:maj7": 26, "D": 27, "D:dim": 28, "D:sus4": 29, "D:min7": 30, "D:min": 31, "D:sus2": 32, "D:aug": 33, "D:dim7": 34, "D:maj6": 35, "D:hdim7": 36, "D:7": 37, "D:min6": 38, "D:maj7": 39, "D#": 40, "D#:dim": 41, "D#:sus4": 42, "D#:min7": 43, "D#:min": 44, "D#:sus2": 45, "D#:aug": 46, "D#:dim7": 47, "D#:maj6": 48, "D#:hdim7": 49, "D#:7": 50, "D#:min6": 51, "D#:maj7": 52, "E": 53, "E:dim": 54, "E:sus4": 55, "E:min7": 56, "E:min": 57, "E:sus2": 58, "E:aug": 59, "E:dim7": 60, "E:maj6": 61, "E:hdim7": 62, "E:7": 63, "E:min6": 64, "E:maj7": 65, "F": 66, "F:dim": 67, "F:sus4": 68, "F:min7": 69, "F:min": 70, "F:sus2": 71, "F:aug": 72, "F:dim7": 73, "F:maj6": 74, "F:hdim7": 75, "F:7": 76, "F:min6": 77, "F:maj7": 78, "F#": 79, "F#:dim": 80, "F#:sus4": 81, "F#:min7": 82, "F#:min": 83, "F#:sus2": 84, "F#:aug": 85, "F#:dim7": 86, "F#:maj6": 87, "F#:hdim7": 88, "F#:7": 89, "F#:min6": 90, "F#:maj7": 91, "G": 92, "G:dim": 93, "G:sus4": 94, "G:min7": 95, "G:min": 96, "G:sus2": 97, "G:aug": 98, "G:dim7": 99, "G:maj6": 100, "G:hdim7": 101, "G:7": 102, "G:min6": 103, "G:maj7": 104, "G#": 105, "G#:dim": 106, "G#:sus4": 107, "G#:min7": 108, "G#:min": 109, "G#:sus2": 110, "G#:aug": 111, "G#:dim7": 112, "G#:maj6": 113, "G#:hdim7": 114, "G#:7": 115, "G#:min6": 116, "G#:maj7": 117, "A": 118, "A:dim": 119, "A:sus4": 120, "A:min7": 121, "A:min": 122, "A:sus2": 123, "A:aug": 124, "A:dim7": 125, "A:maj6": 126, "A:hdim7": 127, "A:7": 128, "A:min6": 129, "A:maj7": 130, "A#": 131, "A#:dim": 132, "A#:sus4": 133, "A#:min7": 134, "A#:min": 135, "A#:sus2": 136, "A#:aug": 137, "A#:dim7": 138, "A#:maj6": 139, "A#:hdim7": 140, "A#:7": 141, "A#:min6": 142, "A#:maj7": 143, "B": 144, "B:dim": 145, "B:sus4": 146, "B:min7": 147, "B:min": 148, "B:sus2": 149, "B:aug": 150, "B:dim7": 151, "B:maj6": 152, "B:hdim7": 153, "B:7": 154, "B:min6": 155, "B:maj7": 156}
\ No newline at end of file
diff --git a/dataset/vevo_meta/chord_attr.json b/dataset/vevo_meta/chord_attr.json
new file mode 100644
index 0000000000000000000000000000000000000000..f55cf806aa5f034812a38385f3c6cff21f8e617d
--- /dev/null
+++ b/dataset/vevo_meta/chord_attr.json
@@ -0,0 +1 @@
+{"N": 0, "maj": 1, "dim": 2, "sus4": 3, "min7": 4, "min": 5, "sus2": 6, "aug": 7, "dim7": 8, "maj6": 9, "hdim7": 10, "7": 11, "min6": 12, "maj7": 13}
diff --git a/dataset/vevo_meta/chord_attr_inv.json b/dataset/vevo_meta/chord_attr_inv.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f355aaa2c26e0c141f64057f17a054a608d4d32
--- /dev/null
+++ b/dataset/vevo_meta/chord_attr_inv.json
@@ -0,0 +1,16 @@
+{
+ "0": "N",
+ "1": "maj",
+ "2": "dim",
+ "3": "sus4",
+ "4": "min7",
+ "5": "min",
+ "6": "sus2",
+ "7": "aug",
+ "8": "dim7",
+ "9": "maj6",
+ "10": "hdim7",
+ "11": "7",
+ "12": "min6",
+ "13": "maj7"
+}
diff --git a/dataset/vevo_meta/chord_inv.json b/dataset/vevo_meta/chord_inv.json
new file mode 100644
index 0000000000000000000000000000000000000000..b3dc21d8972ff10457557821481a94b16cdf5936
--- /dev/null
+++ b/dataset/vevo_meta/chord_inv.json
@@ -0,0 +1 @@
+{"0": "N", "1": "C", "2": "C:dim", "3": "C:sus4", "4": "C:min7", "5": "C:min", "6": "C:sus2", "7": "C:aug", "8": "C:dim7", "9": "C:maj6", "10": "C:hdim7", "11": "C:7", "12": "C:min6", "13": "C:maj7", "14": "C#", "15": "C#:dim", "16": "C#:sus4", "17": "C#:min7", "18": "C#:min", "19": "C#:sus2", "20": "C#:aug", "21": "C#:dim7", "22": "C#:maj6", "23": "C#:hdim7", "24": "C#:7", "25": "C#:min6", "26": "C#:maj7", "27": "D", "28": "D:dim", "29": "D:sus4", "30": "D:min7", "31": "D:min", "32": "D:sus2", "33": "D:aug", "34": "D:dim7", "35": "D:maj6", "36": "D:hdim7", "37": "D:7", "38": "D:min6", "39": "D:maj7", "40": "D#", "41": "D#:dim", "42": "D#:sus4", "43": "D#:min7", "44": "D#:min", "45": "D#:sus2", "46": "D#:aug", "47": "D#:dim7", "48": "D#:maj6", "49": "D#:hdim7", "50": "D#:7", "51": "D#:min6", "52": "D#:maj7", "53": "E", "54": "E:dim", "55": "E:sus4", "56": "E:min7", "57": "E:min", "58": "E:sus2", "59": "E:aug", "60": "E:dim7", "61": "E:maj6", "62": "E:hdim7", "63": "E:7", "64": "E:min6", "65": "E:maj7", "66": "F", "67": "F:dim", "68": "F:sus4", "69": "F:min7", "70": "F:min", "71": "F:sus2", "72": "F:aug", "73": "F:dim7", "74": "F:maj6", "75": "F:hdim7", "76": "F:7", "77": "F:min6", "78": "F:maj7", "79": "F#", "80": "F#:dim", "81": "F#:sus4", "82": "F#:min7", "83": "F#:min", "84": "F#:sus2", "85": "F#:aug", "86": "F#:dim7", "87": "F#:maj6", "88": "F#:hdim7", "89": "F#:7", "90": "F#:min6", "91": "F#:maj7", "92": "G", "93": "G:dim", "94": "G:sus4", "95": "G:min7", "96": "G:min", "97": "G:sus2", "98": "G:aug", "99": "G:dim7", "100": "G:maj6", "101": "G:hdim7", "102": "G:7", "103": "G:min6", "104": "G:maj7", "105": "G#", "106": "G#:dim", "107": "G#:sus4", "108": "G#:min7", "109": "G#:min", "110": "G#:sus2", "111": "G#:aug", "112": "G#:dim7", "113": "G#:maj6", "114": "G#:hdim7", "115": "G#:7", "116": "G#:min6", "117": "G#:maj7", "118": "A", "119": "A:dim", "120": "A:sus4", "121": "A:min7", "122": "A:min", "123": "A:sus2", "124": "A:aug", "125": "A:dim7", "126": "A:maj6", "127": "A:hdim7", "128": "A:7", "129": "A:min6", "130": "A:maj7", "131": "A#", "132": "A#:dim", "133": "A#:sus4", "134": "A#:min7", "135": "A#:min", "136": "A#:sus2", "137": "A#:aug", "138": "A#:dim7", "139": "A#:maj6", "140": "A#:hdim7", "141": "A#:7", "142": "A#:min6", "143": "A#:maj7", "144": "B", "145": "B:dim", "146": "B:sus4", "147": "B:min7", "148": "B:min", "149": "B:sus2", "150": "B:aug", "151": "B:dim7", "152": "B:maj6", "153": "B:hdim7", "154": "B:7", "155": "B:min6", "156": "B:maj7"}
\ No newline at end of file
diff --git a/dataset/vevo_meta/chord_root.json b/dataset/vevo_meta/chord_root.json
new file mode 100644
index 0000000000000000000000000000000000000000..f80daf81c00ab965bbb09b5a3424bf828b3be1f7
--- /dev/null
+++ b/dataset/vevo_meta/chord_root.json
@@ -0,0 +1 @@
+{"N": 0, "C": 1, "C#": 2, "D": 3, "D#": 4, "E": 5, "F": 6, "F#": 7, "G": 8, "G#": 9, "A": 10, "A#": 11, "B": 12}
\ No newline at end of file
diff --git a/dataset/vevo_meta/chord_root_inv.json b/dataset/vevo_meta/chord_root_inv.json
new file mode 100644
index 0000000000000000000000000000000000000000..9febc158c8b1aba9899c06b4aec88b4a7e7b6543
--- /dev/null
+++ b/dataset/vevo_meta/chord_root_inv.json
@@ -0,0 +1,15 @@
+{
+ "0": "N",
+ "1": "C",
+ "2": "C#",
+ "3": "D",
+ "4": "D#",
+ "5": "E",
+ "6": "F",
+ "7": "F#",
+ "8": "G",
+ "9": "G#",
+ "10": "A",
+ "11": "A#",
+ "12": "B"
+}
\ No newline at end of file
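Taken together, chord_inv.json, chord_root.json/chord_root_inv.json, and chord_attr.json define a factored chord vocabulary: 12 roots times 13 attributes, with id 0 reserved for "N" (no chord) and full chord ids laid out root-major, i.e. chord_id = (root_id - 1) * 13 + attr_id (for example, G:7 is (8 - 1) * 13 + 11 = 102). The sketch below verifies that layout against the files; it is not repo code, and it assumes that attribute ids 1-3 (before the visible "4": "min7") continue the same pattern and that a bare root name denotes the major triad, attribute id 1.

```python
# Sanity-check sketch (not part of the repo) for the factored chord vocabulary:
#   chord_id = (root_id - 1) * 13 + attr_id, with id 0 reserved for "N".
# Assumes attribute ids 1-3 follow the pattern visible above and that a bare
# root name ("C", "F#", ...) is the major triad, attribute id 1.
import json

with open("dataset/vevo_meta/chord_inv.json") as f:
    chord_inv = json.load(f)      # "1" -> "C", "2" -> "C:dim", ..., "156" -> "B:maj7"
with open("dataset/vevo_meta/chord_root.json") as f:
    chord_root = json.load(f)     # "C" -> 1, ..., "B" -> 12
with open("dataset/vevo_meta/chord_attr.json") as f:
    chord_attr = json.load(f)     # "4" -> "min7", ..., "13" -> "maj7"

attr_to_id = {name: int(i) for i, name in chord_attr.items()}

for cid, name in chord_inv.items():
    if name == "N":
        continue
    root, _, attr = name.partition(":")
    attr_id = attr_to_id[attr] if attr else 1   # bare root = major triad
    assert int(cid) == (chord_root[root] - 1) * 13 + attr_id, name
print("layout verified for", len(chord_inv) - 1, "chords")
```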
diff --git a/dataset/vevo_meta/exclude.txt b/dataset/vevo_meta/exclude.txt
new file mode 100644
index 0000000000000000000000000000000000000000..13ef0a79b8787de65a0c008583ab7dd8a4a47fd0
--- /dev/null
+++ b/dataset/vevo_meta/exclude.txt
@@ -0,0 +1 @@
+453
\ No newline at end of file
diff --git a/dataset/vevo_meta/idlist.txt b/dataset/vevo_meta/idlist.txt
new file mode 100644
index 0000000000000000000000000000000000000000..620ed37e15a6984c0284408eaa45da4b35b8ae38
--- /dev/null
+++ b/dataset/vevo_meta/idlist.txt
@@ -0,0 +1,748 @@
+001-Luis Fonsi - Despacito ft. Daddy Yankee kJQP7kiw5Fk
+002-Mark Ronson - Uptown Funk (Official Video) ft. Bruno Mars OPf0YbXqDm0
+003-Maroon 5 - Sugar (Official Music Video) 09R8_2nJtjg
+004-Justin Bieber - Sorry (PURPOSE - The Movement) fRh_vgS2dFE
+005-Katy Perry - Roar (Official) CevxZvSJLk8
+006-OneRepublic - Counting Stars (Official Music Video) hT_nvWreIhg
+007-Katy Perry - Dark Horse (Official) ft. Juicy J 0KSOMA3QBU0
+008-Crazy Frog - Axel F (Official Video) k85mRPqvMbE
+009-Enrique Iglesias - Bailando ft. Descemer Bueno, Gente De Zona (Español) NUsoVlDFqZg
+010-Taylor Swift - Shake It Off nfWlot6h_JM
+011-J Balvin, Willy William - Mi Gente (Official Video) wnJ6LuUFpMo
+012-Shakira - Waka Waka (This Time for Africa) (The Official 2010 FIFA World Cup™ Song) pRpeEdMmmQ0
+013-Adele - Hello YQHsXMglC9A
+014-Taylor Swift - Blank Space e-ORhEE9VVg
+016-Shakira - Chantaje (Official Video) ft. Maluma 6Mgqbai3fKo
+017-Justin Bieber - Baby (Official Music Video) ft. Ludacris kffacxfA7G4
+018-Calvin Harris - This Is What You Came For (Official Video) ft. Rihanna kOkQ4T5WO9E
+019-Fifth Harmony - Work from Home (Official Video) ft. Ty Dolla $ign 5GL9JoH4Sws
+020-Meghan Trainor - All About That Bass 7PCkvCPvDXk
+021-Sia - Chandelier (Official Video) 2vjPBrBU-TM
+022-Eminem - Love The Way You Lie ft. Rihanna uelHwf8o7_U
+023-Ellie Goulding - Love Me Like You Do (Official Video) AJtDXIazrMo
+024-Shawn Mendes - Treat You Better lY2yjAdbvdQ
+025-Justin Bieber - What Do You Mean (Official Music Video) DK_0jXPuIr0
+026-MAGIC! - Rude (Official Video) PIh2xe4jnpk
+027-Luis Fonsi, Demi Lovato - Échame La Culpa (Video Oficial) TyHvyGVs42U
+028-Avicii - Wake Me Up (Official Video) IcrbM1l_BoI
+029-LMFAO ft. Lauren Bennett, GoonRock - Party Rock Anthem (Official Video) KQ6zr6kCPj8
+030-Imagine Dragons - Believer 7wtfhZwyrcc
+031-Becky G, Bad Bunny - Mayores (Official Video) GMFewiplIbw
+032-John Legend - All of Me (Official Video) 450p7goxZqg
+033-Fifth Harmony - Worth It (Official Video) ft. Kid Ink YBHQbu5rbdQ
+035-The Weeknd - Starboy ft. Daft Punk (Official Video) 34Na4j8AVgA
+036-Ariana Grande ft. Nicki Minaj - Side To Side (Official Video) ft. Nicki Minaj SXiSVQZLje8
+037-Adele - Rolling in the Deep (Official Music Video) rYEDA3JcQqw
+038-Rihanna - Diamonds lWA2pjMjpBs
+039-Jennifer Lopez - On The Floor ft. Pitbull t4H_Zoh7G5A
+041-Silentó - Watch Me (Whip_Nae Nae) (Official) vjW8wmF5VWc
+042-Romeo Santos - Propuesta Indecente (Official Video) QFs3PIZb3js
+043-J. Balvin - Ay Vamos (Official Video) TapXs54Ah3E
+044-Adele - Someone Like You (Official Music Video) hLQl3WQQoQ0
+045-Drake - Hotline Bling uxpDa-c-4Mc
+046-Guns N' Roses - November Rain 8SbUC-UaAxE
+047-ZAYN - Dusk Till Dawn (Official Video) ft. Sia tt2k8PGm-TI
+048-The Chainsmokers - Don't Let Me Down (Official Video) ft. Daya Io0fBr1XBUA
+049-The Weeknd - The Hills (Official Video) yzTuBuRdAyA
+050-Imagine Dragons - Thunder fKopy74weus
+051-Jessie J, Ariana Grande, Nicki Minaj - Bang Bang (Official Video) 0HDdjwpPM3Y
+052-Ricky Martin - Vente Pa' Ca (Official Video) ft. Maluma iOe6dI2JhgU
+054-CNCO - Reggaetón Lento (Bailemos) 7jpqqBX-Myw
+055-Chino y Nacho - Andas En Mi Cabeza ft. Daddy Yankee (Video Oficial) AMTAQ-AJS4Y
+056-Justin Bieber - Love Yourself (Official Music Video) oyEuk8j8imI
+057-DJ Khaled - I'm The One ft. Justin Bieber, Quavo, Chance the Rapper, Lil Wayne weeI1G46q0o
+058-Eminem - Not Afraid (Official Video) j5-yKhDd64s
+059-Calvin Harris - Summer (Official Video) ebXbLfLACGM
+060-CAN'T STOP THE FEELING! (from DreamWorks Animation's 'TROLLS') (Official Video) ru0K8uYEZWw
+061-Lady Gaga - Bad Romance (Official Music Video) qrO4YZeyl0I
+062-Carlos Vives, Sebastián Yatra - Robarte un Beso (Official Video) Mtau4v6foHA
+063-Ellie Goulding - Burn (Official Video) CGyEd0aKWZE
+064-Calvin Harris & Disciples - How Deep Is Your Love EgqUJOudrcM
+065-Carlos Vives, Shakira - La Bicicleta -UV0QGLmYys
+066-Taylor Swift - Bad Blood ft. Kendrick Lamar QcIy9NiNbmo
+067-Mike Posner - I Took A Pill In Ibiza (Seeb Remix) (Explicit) foE1mO2yM04
+068-Sam Smith - I'm Not The Only One (Official Video) nCkpzqqog4k
+069-Rag'n'Bone Man - Human (Official Video) L3wKzyIN1yk
+070-Carly Rae Jepsen - Call Me Maybe fWNaR-rxAic
+071-Shawn Mendes - Stitches (Official Video) VbfpW0pbvaU
+072-Wisin - Escápate Conmigo (Official Video) ft. Ozuna 3X9wEwulYhk
+073-Post Malone - Congratulations ft. Quavo SC4xMk98Pdc
+074-Nirvana - Smells Like Teen Spirit (Official Music Video) hTWKbfoikeg
+075-Gente de Zona - La Gozadera (Official Video) ft. Marc Anthony VMp55KH_3wo
+076-Katy Perry - Last Friday Night (T.G.I.F.) (Official Music Video) KlyXNRrsk4A
+077-P!nk - Just Give Me A Reason ft. Nate Ruess OpQFFLBMEPI
+078-Katy Perry - Firework (Official Music Video) QGJuMBdaqIw
+079-Imagine Dragons - Radioactive ktvTqknDobU
+080-Pitbull - Timber (Official Video) ft. Ke$ha hHUbLv4ThOo
+081-French Montana - Unforgettable ft. Swae Lee CTFtOOh47oo
+082-50 Cent - In Da Club (Official Music Video) 5qm8PH4xAss
+083-Guns N' Roses - Sweet Child O' Mine (Official Music Video) 1w7OgIMMRc4
+084-One Direction - What Makes You Beautiful (Official Video) QJO3ROT-A4E
+085-Ariana Grande ft. Iggy Azalea - Problem (Official Video) iS1g8G_njx8
+086-Sam Smith - Too Good At Goodbyes (Official Video) J_ub7Etch2U
+087-AronChupa - I'm an Albatraoz _ OFFICIAL VIDEO Bznxx12Ptl0
+088-Taylor Swift - Look What You Made Me Do 3tmd-ClpJxA
+089-Chris Jedi - Ahora Dice (Official Video) ft. J. Balvin, Ozuna, Arcángel c73Cu3TQnlg
+090-Joey Montana - Picky RqpKDkVzlqU
+091-Eminem - Without Me (Official Music Video) YVkUvmDQ3HY
+092-Prince Royce - Darte un Beso bdOXnTbyk0g
+093-Taylor Swift - You Belong With Me VuNIsY6JdUw
+094-Eminem - Rap God (Explicit) XbGs_qK2PQA
+095-Don Omar - Danza Kuduro ft. Lucenzo 7zp1TbLFPp8
+096-Maluma - El Perdedor (Official Video) PJniSb91tvo
+097-Rihanna - Work (Explicit) ft. Drake HL1UzIK-flA
+098-Ricky Martin - La Mordidita (Official Video) ft. Yotuel lBztnahrOFw
+099-Beyoncé - Halo bnVUHWCynig
+100-The Weeknd - Can't Feel My Face (Official Video) KEI4qSrkPAs
+101-Shakira - La La La (Brazil 2014) ft. Carlinhos Brown 7-7knsP2n5w
+102-Sia - Elastic Heart feat. Shia LaBeouf & Maddie Ziegler (Official Video) KWZGAExj-es
+103-Katy Perry - Bon Appétit (Official) ft. Migos dPI-mRFEIH0
+104-The Cranberries - Zombie (Official Music Video) 6Ejga4kJUts
+105-Shakira - Can't Remember to Forget You (Official Video) ft. Rihanna o3mP3mJDL2k
+106-Daddy Yankee - Limbo (Video Oficial) 6BTjG-dhf5s
+107-Whitney Houston - I Will Always Love You (Official 4K Video) 3JWTaaS7LdU
+108-Miley Cyrus - Wrecking Ball (Official Video) My2FRPA3Gf8
+109-Chris Brown - Loyal (Official Video) ft. Lil Wayne, Tyga JXRN_LkCa_o
+110-Pitbull - Rain Over Me ft. Marc Anthony SmM0653YvXU
+111-Enrique Iglesias - El Perdedor (Pop) ft. Marco Antonio Solís tLcfAnN2QgY
+112-J Balvin - 6 AM ft. Farruko (Official Video) yUV9JwiQLog
+113-System Of A Down - Chop Suey! (Official HD Video) CSvFpBOe8eY
+114-Naughty Boy - La la la ft. Sam Smith (Official Video) 3O1_3zBUKM8
+115-Rick Astley - Never Gonna Give You Up (Official Music Video) dQw4w9WgXcQ
+116-Ariana Grande - Break Free ft. Zedd L8eRzOYhLuw
+117-Sam Smith - Stay With Me (Official Video) pB-5XG-DbAA
+118-Michael Jackson - Billie Jean (Official Video) Zi_XLOBDo_Y
+119-Nelly - Dilemma (Official Music Video) ft. Kelly Rowland 8WYHDfJDPDc
+120-ZAYN - PILLOWTALK (Official Music Video) C_3d6GntKbk
+121-DJ Snake, Lil Jon - Turn Down for What HMUDVMiITOU
+122-Katy Perry - Hot N Cold (Official) kTHNpusq654
+123-Iggy Azalea - Fancy ft. Charli XCX (Official Music Video) O-zpOMYRi0w
+124-Bon Jovi - It's My Life (Official Music Video) vx2u5uUu3DE
+125-Chino & Nacho - Me Voy Enamorando ft. Farruko (Remix) (Official Music Video) 0yr75-gxVtM
+126-Marc Anthony - Vivir Mi Vida (Official Video) YXnjy5YlDwk
+127-Justin Bieber - Never Say Never (Official Music Video) ft. Jaden Smith _Z5-P9v3F8w
+128-Shawn Mendes - There's Nothing Holdin' Me Back dT2owtxkU8k
+129-Enrique Iglesias - DUELE EL CORAZON ft. Wisin xFutjZEBTXs
+130-DJ Khaled - Wild Thoughts (Official Video) ft. Rihanna, Bryson Tiller fyaI4-5849w
+131-Maluma - Sin Contrato (Official Video) 9xByMBYDRmY
+132-Nicki Minaj - Anaconda LDZX4ooRsWs
+133-Maluma - Borro Cassette (Official Video) Xk0wdDTTPA0
+134-AC_DC - Thunderstruck (Official Video) v2AC41dglnM
+135-Romeo Santos - Eres Mía 8iPcqtHoR3U
+136-Backstreet Boys - I Want It That Way (Official HD Video) 4fndeDfaWCg
+137-Shakira - Hips Don't Lie (Official 4K Video) ft. Wyclef Jean DUT5rEU6pqM
+138-Camila Cabello - Havana ft. Young Thug BQ0mxQXmLsk
+139-Rihanna - We Found Love ft. Calvin Harris tg00YEETFzg
+140-J Balvin - Safari ft. Pharrell Williams, BIA, Sky (Official Video) JWESLtAKKlU
+141-Rihanna - Stay ft. Mikky Ekko JF8BRvqGCNs
+142-Maluma - Cuatro Babys (Official Video) ft. Trap Capos, Noriel, Bryant Myers, Juhn OXq-JP8w5H4
+143-Cyndi Lauper - Girls Just Want To Have Fun (Official Video) PIb6AZdTr-A
+144-Evanescence - Bring Me To Life (Official Music Video) 3YxaaGgTQYM
+145-Justin Bieber - Beauty And A Beat ft. Nicki Minaj (Official Music Video) Ys7-6_t7OEQ
+146-One Direction - Drag Me Down (Official Video) Jwgf3wmiA04
+147-Auli'i Cravalho - How Far I'll Go (from Moana_Official Video) cPAbx5kgCJo
+148-Aqua - Barbie Girl (Official Music Video) ZyhrYis509A
+149-Dr. Dre ft. Snoop Dogg - Still D.R.E. (Official Video) _CL6n0FJZpk
+150-Justin Timberlake - Mirrors (Official Video) uuZE_IRwLNI
+151-Katy Perry - Wide Awake (Official Video) k0BWlvnBmIE
+152-J Balvin - Si Tu Novio Te Deja Sola ft. Bad Bunny (Official Video) Km4BayZykwE
+153-Maroon 5 - One More Night (Official Music Video) fwK7ggA3-bU
+154-Imagine Dragons - Demons (Official Video) mWRsgZuwf_8
+155-Ariana Grande - Focus lf_wVfwpfp8
+156-Europe - The Final Countdown (Official Video) 9jK-NcRmVcw
+157-Lady Gaga - Poker Face (Official Music Video) bESGLojNYSo
+158-Post Malone - rockstar ft. 21 Savage UceaB4D0jpo
+159-Ayo & Teo - Rolex (Official Video) lwk5OUII9Vc
+160-Thalia - Desde Esa Noche (Premio Lo Nuestro 2016) ft. Maluma 6C_s56iscpQ
+161-Cali Y El Dandee - Por Fin Te Encontré ft. Juan Magan, Sebastian Yatra (Video Oficiel) _kxz7WX4mLU
+162-One Direction - Story of My Life W-TE_Ys4iwM
+163-Miley Cyrus - We Can't Stop (Official Video) LrUvu1mlWco
+164-Mike WiLL Made-It - 23 ft. Miley Cyrus, Wiz Khalifa, Juicy J (Official Music Video) bbEoRnaOIbs
+165-Scorpions - Wind Of Change (Official Music Video) n4RjJKxsamQ
+166-Nicki Minaj - Super Bass 4JipHEz53sU
+167-Karol G, Bad Bunny - Ahora Me Llama (Official Video) 4NNRy_Wz16k
+168-Tove Lo - Habits (Stay High) - Hippie Sabotage Remix SYM-RJwSGQ8
+169-Harry Styles - Sign of the Times (Official Video) qN4ooNx77u0
+170-The Police - Every Breath You Take (Official Video) OMOGaugKpzs
+171-Avicii - Waiting For Love cHHLHGNpCSA
+172-Ariana Grande - Into You (Official Video) 1ekZEVeXwek
+173-will.i.am - Scream & Shout ft. Britney Spears (Official Music Video) kYtGl1dX5qI
+174-Rihanna - What's My Name (Official Music Video) ft. Drake U0CGsw6h60k
+175-Katy Perry - Part Of Me (Official) uuwfgXD8qV8
+176-Pitbull - Give Me Everything ft. Ne-Yo, Afrojack, Nayer EPo5wWmKEaI
+177-Audioslave - Like a Stone (Official Video) 7QU1nvuxaMA
+178-HA-ASH - Perdón, Perdón (Primera Fila - Hecho Realidad [En Vivo]) _wL3Pc-EmjA
+179-Katy Perry - The One That Got Away (Official Music Video) Ahha3Cqe_fk
+180-Nacho, Yandel, Bad Bunny - Báilame (Remix) T7VewKI44rQ
+181-Sean Kingston - Beautiful Girls MrTz5xjmso4
+182-LMFAO - Sexy and I Know It (Official Video) wyx6JDQCslE
+183-Eminem - When I'm Gone (Official Music Video) 1wYNFfgrXTI
+184-Michael Jackson - They Don’t Care About Us (Brazil Version) (Official Video) QNJL6nfu__Q
+185-Lorde - Royals (US Version) nlcIKh6sBtc
+186-R. City - Locked Away ft. Adam Levine 6GUm5g8SG4o
+187-John Newman - Love Me Again CfihYWRWRTQ
+188-No Doubt - Don't Speak (Official 4K Music Video) TR3Vdo5etCQ
+189-Bon Jovi - Livin' On A Prayer (Official Music Video) lDK9QqIzhwk
+190-Stromae - Papaoutai oiKj0Z_Xnjc
+191-Rae Sremmurd - Black Beatles ft. Gucci Mane (Official Video) b8m9zhNAgKs
+192-Little Mix - Black Magic (Official Video) MkElfR_NPBI
+193-Shakira - Perro Fiel (Official Video) ft. Nicky Jam SHq2qrFUlGY
+194-Wisin - Adrenalina (Official Video) ft. Jennifer Lopez, Ricky Martin ME2Hufquz0k
+195-Beyoncé - Single Ladies (Put a Ring on It) (Video Version) 4m1EFMoRFvY
+196-Bonnie Tyler - Total Eclipse of the Heart (Video) lcOxhH8N3Bo
+197-Rihanna - Only Girl (In The World) (Official Music Video) pa14VNsdSYM
+198-Miley Cyrus - Party In The U.S.A. (Official Video) M11SvDtPBhA
+199-Rae Sremmurd - No Type (Official Video) wzMrK-aGCug
+200-J. Balvin - Ginza (Official Video) zZjSX01P5dE
+201-Justin Bieber - Boyfriend (Official Music Video) 4GuqB1BQVr4
+202-Akon - Smack That (Official Music Video) ft. Eminem bKDdT_nyP54
+203-Rihanna - Man Down sEhy-RXkNo0
+204-Indila - Dernière Danse (Clip Officiel) K5KAc5CoCuk
+205-Hoobastank - The Reason (Official Music Video) fV4DiAyExN0
+206-Kendrick Lamar - HUMBLE. tvTRZJ-4EyI
+207-Foster The People - Pumped Up Kicks (Official Video) SDTZ7iX4vTQ
+208-Khalid - Young Dumb & Broke (Official Video) IPfJnp1guPc
+209-Michael Jackson - Thriller (Official Video) sOnqjkJTMaA
+210-Pitbull - International Love (Official Video) ft. Chris Brown CdXesX6mYUE
+211-Calvin Harris - I Need Your Love (Official Video) ft. Ellie Goulding AtKZKl7Bgu0
+212-Eminem ft. Rihanna - The Monster (Explicit) [Official Video] EHkozMIXZ8w
+213-Evanescence - My Immortal (Official Music Video) 5anLPw0Efmo
+214-Swedish House Mafia ft. John Martin - Don't You Worry Child (Official Video) 1y6smkh6c-0
+215-George Michael - Careless Whisper (Official Video) izGwDsrQ1eQ
+216-Jennifer Lopez - Ain't Your Mama (Official Video) Pgmx7z49OEk
+217-Shakira - Me Enamoré (Official Video) sPTn0QEhxds
+218-We Are One (Ole Ola) [The Official 2014 FIFA World Cup Song] (Olodum Mix) TGtWWb9emYI
+219-AC_DC - Back In Black (Official Video) pAgnJDJN4VA
+220-Avicii - The Nights UtF6Jej8yb4
+221-La Adictiva Banda San José de Mesillas - Después de Ti, ¿Quién (Video Oficial) YWu9mB6X9Oc
+222-Kygo - Firestone ft. Conrad Sewell (Official Video) 9Sc-ir2UwGU
+223-Taylor Swift - Wildest Dreams IdneKLhsWOQ
+224-Bon Jovi - Always (Official Music Video) 9BMwcO6_hyA
+225-Maroon 5 - Animals (Official Music Video) qpgTC9MDx1o
+226-Farruko - Chillax ft. Ky-Mani Marley (Official Video) 7fEQmJ98x_Y
+227-Michael Jackson - Beat It (Official Video) oRdxUFDoQe0
+228-Bobby Shmurda - Hot N_gga (Official Music Video) vJwKKKd2ZYE
+229-Adele - Send My Love (To Your New Lover) fk4BbF7B29w
+230-Robin Thicke - Blurred Lines ft. T.I., Pharrell (Official Music Video) yyDUC1LUXSU
+231-Calvin Harris - Blame ft. John Newman 6ACl8s_tBzE
+232-Jessie J - Price Tag ft. B.o.B qMxX-QOV9tI
+233-Katy Perry - This Is How We Do (Official) 7RMQksXpQSk
+234-Don Omar - Taboo lRWqYR3e7xE
+235-Romeo Santos - Yo También (Official Video) ft. Marc Anthony QBaIMZ8QjcU
+236-Alvaro Soler - Sofia (Official Music Video) qaZ0oAh4evU
+237-Rihanna - Umbrella (Orange Version) (Official Music Video) ft. JAY-Z CvBfHwUxHIk
+238-Farruko, Bad Bunny, Rvssian - Krippy Kush (Official Video) j1_JW7An2l0
+239-Selena Gomez - The Heart Wants What It Wants (Official Video) ij_0p_6qTss
+240-Enrique Iglesias, Juan Luis Guerra - Cuando Me Enamoro (Official Music Video) 4DO8GsIYfhQ
+241-Zara Larsson - Lush Life tD4HCZe-tew
+242-The Verve - Bitter Sweet Symphony (Official Music Video) 1lyu1KKwC74
+243-The Black Eyed Peas - Where Is The Love (Official Music Video) WpYeekQkAdc
+244-One Direction - Best Song Ever o_v9MY_FMcw
+245-Maroon 5 - Moves Like Jagger ft. Christina Aguilera (Official Music Video) iEPTlhBmwRg
+246-Sia - The Greatest GKSRyLdjsPA
+247-Akon - Lonely (Official Music Video) 6EEW-9NDM5k
+248-Ariana Grande, The Weeknd - Love Me Harder (Official Video) g5qU7p7yOY8
+249-50 Cent - Candy Shop (Official Music Video) ft. Olivia SRcnnId15BA
+250-Selena Gomez - Come & Get It n-D1EB74Ckg
+251-Meghan Trainor - Like I'm Gonna Lose You (Official Video) ft. John Legend 2-MBfn8XjIU
+252-Jonas Blue - Mama ft. William Singe (Official Video) qPTfXwPf_HM
+253-One Direction - One Thing Y1xs_xPb46M
+254-Mariah Carey - All I Want For Christmas Is You (Official Video) yXQViqx6GMY
+255-Jonas Blue - Perfect Strangers ft. JP Cooper (Official Video) Ey_hgKCCYU4
+256-Maroon 5 - Payphone ft. Wiz Khalifa (Explicit) (Official Music Video) KRaWnd3LJfs
+257-Simone & Simaria - Loka (Ao Vivo) ft. Anitta UrT0zCmsN8c
+258-Future - Low Life (Official Music Video) ft. The Weeknd K_9tX4eHztY
+259-Silvestre Dangond, Nicky Jam - Cásate Conmigo (Official Video) cpN78ZjnCZY
+261-One Direction - Live While We're Young AbPED9bisSc
+262-Lil Wayne - Mirror ft. Bruno Mars (Official Music Video) OZLUa8JUR18
+263-Katy Perry - Chained To The Rhythm (Official) ft. Skip Marley Um7pMggPnug
+264-Justin Bieber - One Time (Official Music Video) CHVhwcOg6y8
+265-Dillon Francis, DJ Snake - Get Low (Official Music Video) 12CeaxLiMgE
+266-The Weeknd - Earned It (from Fifty Shades Of Grey) (Official Video - Explicit) waU75jdUnYw
+267-Taylor Swift - Style -CmadmM5cOk
+268-Adele - Set Fire To The Rain (Live at The Royal Albert Hall) Ri7-vnrJD3k
+269-Wham! - Last Christmas (Official Video) E8gmARGvPlI
+270-3 Doors Down - Here Without You (Official Music Video) kPBzTxZQG5Q
+271-Shakira - Try Everything (Official Video) c6rP-YP4c5I
+272-Guns N' Roses - Paradise City Rbm6GXllBiw
+273-MC Hammer - U Can't Touch This (Official Music Video) otCpCn0l4Wo
+274-Taylor Swift - We Are Never Ever Getting Back Together WA4iX5D9Z64
+275-The Black Eyed Peas - Pump It (Official Music Video) ZaI2IlHwmgQ
+276-Sia - Cheap Thrills (Performance Edit) 31crA53Dgu0
+277-Nelly Furtado - Say It Right (Official Music Video) 6JnGBs88sL0
+278-Britney Spears - ...Baby One More Time (Official Video) C-u5WLJ9Yk4
+279-Banda Los Recoditos - Mi Último Deseo (Video Oficial) cVlAmP-KDT4
+280-Jessie J - Flashlight (from Pitch Perfect 2) (Official Video) DzwkcbTQ7ZE
+282-Demi Lovato - Heart Attack (Official Video) AByfaYcOm4A
+283-Meghan Trainor - Me Too qDRORgoZxZU
+284-Guns N' Roses - Don't Cry zRIbf6JqkNc
+285-Bastille - Pompeii (Official Music Video) F90Cw4l-8NY
+286-Akon - Right Now (Na Na Na) (Official Video) vIaH35-MLsk
+287-Katy Perry - Swish Swish (Official) ft. Nicki Minaj iGk5fR-t5AU
+288-Shakira - La Tortura (Official HD Video) ft. Alejandro Sanz Dsp_8Lm1eSk
+289-ZAYN, Taylor Swift - I Don’t Wanna Live Forever (Fifty Shades Darker) 7F37r50VUTQ
+290-Ariana Grande - Dangerous Woman 9WbCfHutDSE
+291-Pitbull - Feel This Moment (Official Video) ft. Christina Aguilera 5jlI4uzZGjU
+292-Selena Gomez & The Scene - Love You Like A Love Song EgT_us6AsDg
+293-Wisin, Carlos Vives - Nota de Amor (Official Video) ft. Daddy Yankee wZRWpr1G1Qw
+294-Beyoncé - Drunk in Love (Explicit) ft. JAY Z p1JPKLa-Ofc
+295-Romeo Santos - La Diabla_Mi Santa ft. Tomatito Hz9lhqxl_gQ
+296-Maroon 5 - She Will Be Loved (Official Music Video) nIjVuRTm-dc
+297-The Black Eyed Peas - My Humps iEe_eraFWWs
+298-Duke Dumont - Ocean Drive (Official Music Video) KDxJlW6cxRk
+299-Iggy Azalea - Black Widow ft. Rita Ora (Official Music Video) u3u22OYqFGo
+300-Justin Bieber - Company (Official Music Video) gdx7gN1UyX0
+301-Anna Kendrick - Cups (Pitch Perfect’s “When I’m Gone”) [Official Video] cmSbXsFE3l8
+302-J Balvin - Bobo (Official Video) 0GvLP2C2w9U
+303-Alicia Keys - No One (Official Music Video) rywUS-ohqeE
+304-Adele - When We Were Young (Live at The Church Studios) DDWKuo3gXMQ
+305-Waka Waka (Esto es Africa) (Cancion Oficial de la Copa Mundial de la FIFA™ Sudafrica 2010) dzsuE5ugxf4
+306-Katy Perry - California Gurls (Official Music Video) ft. Snoop Dogg F57P9C4SAW4
+307-The Pussycat Dolls - Buttons (Official Music Video) ft. Snoop Dogg VCLxJd1d84s
+308-Taylor Swift - Love Story 8xg3vE8Ie_E
+309-Demi Lovato - Let It Go (from 'Frozen') (Official Video) kHue-HaXXzg
+311-Calvin Harris - Outside (Official Video) ft. Ellie Goulding J9NQFACZYEU
+312-Shakira - Whenever, Wherever (Official HD Video) weRHyjj34ZE
+313-Maroon 5 - What Lovers Do ft. SZA (Official Music Video) 5Wiio4KoGe8
+314-Wisin & Yandel - Follow The Leader ft. Jennifer Lopez Xmap94TcDNs
+315-Enrique Iglesias - Loco ft. Romeo Santos RSyUWjftHrs
+316-Toni Braxton - Un-Break My Heart (Official HD Video) p2Rch6WvPJE
+317-Daddy Yankee - Sígueme y Te Sigo (Video Oficial) EfF9EE6ZR5E
+318-Ke$ha - TiK ToK (Official HD Video) iP6XpLQM2Cs
+319-Katy Perry - E.T. ft. Kanye West (Official Music Video) t5Sd5c4o9UM
+320-Meghan Trainor - NO cMTAUr3Nm6I
+321-Katy Perry - Unconditionally (Official) XjwZAa2EjKA
+322-Taylor Swift - 22 AgFeZr5ptV8
+323-Roxette - It Must Have Been Love (Official Music Video) k2C5TjS2sh4
+324-Capital Cities - Safe And Sound (Official Music Video) 47dtFZ8CFo8
+325-Shakira - Loca (Spanish Version) ft. El Cata XAhTt60W7qo
+326-Wisin & Yandel - Algo Me Gusta De Ti ft. Chris Brown, T-Pain 3rgwIp6D3ow
+327-Rihanna - Rude Boy (Official Music Video) e82VE8UtW8A
+328-Beyoncé - Crazy In Love ft. JAY Z ViwtNLUqkMY
+329-Rihanna - Don't Stop The Music yd8jh9QYfEs
+330-Meghan Trainor - Dear Future Husband (Official Video) ShlW5plD_40
+331-Eminem - Mockingbird (Official Music Video) S9bCLPwzSC0
+332-A Great Big World, Christina Aguilera - Say Something -2U0Ivkn2Ds
+333-Nelly - Just A Dream (Official Music Video) N6O2ncUKvlg
+334-Meghan Trainor - Lips Are Movin (Official Music Video) qDc_5zpBj7s
+335-Avril Lavigne - Girlfriend Bg59q4puhmg
+336-Rihanna - Take A Bow (Official Music Video) J3UjJ4wKLkg
+337-The Black Eyed Peas - I Gotta Feeling (Official Music Video) uSD4vsh1zDA
+338-Rihanna - Where Have You Been HBxt_v0WF6Y
+339-Avicii - Levels _ovdm2yX4MA
+340-Eminem - No Love (Explicit Version) ft. Lil Wayne KV2ssT8lzj8
+341-Rachel Platten - Fight Song (Official Video) xo1VInw-SKc
+342-LMFAO - Sorry For Party Rocking SkTt9k4Y-a8
+343-Abba - Dancing Queen (Official Music Video Remastered) xFrGuyw1V8s
+344-The Black Eyed Peas - The Time (Dirty Bit) (Official Music Video) JwQZQygg3Lk
+345-Plan B - Mi Vecinita SB8-YY2DyHI
+346-One Direction - You & I _kqQDCxRCzM
+347-Jennifer Lopez - Dance Again (Official Video) ft. Pitbull bjgFH01k0gU
+348-Britney Spears - Toxic (Official HD Video) LOZuxwVk7TU
+349-Chris Brown - With You (Official Video) nmjdaBaZe8Y
+351-G-Eazy x Bebe Rexha - Me, Myself & I bSfpSOBD30U
+352-Chris Brown - Look at Me Now (Official Video) ft. Lil Wayne, Busta Rhymes 8gyLR4NfMiI
+353-Beyoncé - If I Were A Boy AWpsOqh8q0M
+354-Beyoncé - Run the World (Girls) (Official Video) VBmMU_iwe6U
+355-Rich Gang ft. Young Thug, Rich Homie Quan - Lifestyle (Official Video) nGt_JGHYEO4
+356-One Direction - Kiss You (Official) T4cdfRohhcg
+357-Pia Mia - Do It Again ft. Chris Brown, Tyga (Official Music Video) cNw8A5pwbVI
+358-HA-ASH - Te Dejo En Libertad (En Vivo) ZxvI1epOAWE
+359-Maître Gims - Bella (Clip officiel) rMltoD1jCGI
+360-Selena Gomez - Kill Em With Kindness HHP5MKgK0o8
+361-AC_DC - Highway to Hell (Live At River Plate, December 2009) gEPmA3USJdI
+362-Imagine Dragons - It's Time sENM2wA_FTg
+363-Lana Del Rey - Born To Die (Official Music Video) Bag1gUxuU0g
+364-Sean Kingston, Justin Bieber - Eenie Meenie (Video Version) prmmCg5bKxA
+365-Alejandro Fernández - Hoy Tengo Ganas De Ti ft. Christina Aguilera (Video Oficial) Z81hsLIY1sQ
+366-Prince Royce, Shakira - Deja vu (Official Video) XEvKn-QgAY0
+367-Camila - Mientes (Video) xftFxCYQTdk
+368-Dr. Dre ft. Snoop Dogg, Kurupt, Nate Dogg - The Next Episode (Official Video) QZXc39hT8t4
+369-Soulja Boy Tell'em - Crank That (Soulja Boy) (Official Music Video) 8UFIYGkROII
+370-The Script - Hall of Fame (Official Video) ft. will.i.am mk48xRzuNvA
+371-Zara Larsson, MNEK - Never Forget You GTyN-DB_v5M
+372-Mariah Carey - We Belong Together (Official Music Video) 0habxsuXW4g
+374-Mr. Probz - Waves (Robin Schulz Remix Radio Edit) pUjE9H8QlA4
+375-P!nk - Try (Official Video) yTCDVfMz15M
+376-Justin Bieber - I'll Show You (Official Music Video) PfGaX8G0f2E
+377-One Direction - Perfect (Official Video) Ho32Oh6b4jc
+378-Selena Gomez - Good For You 1TsVjvEkc4s
+379-Demi Lovato - Sorry Not Sorry (Official Video) -MsvER1dpjM
+380-Plan B - Fanatica Sensual Official Video QvypZSdjO8M
+381-Eminem - Stan (Long Version) ft. Dido gOMhN-hfMtY
+382-Nicki Minaj - Only ft. Drake, Lil Wayne, Chris Brown zXtsGAkyeIo
+383-Foo Fighters - The Pretender SBjQ9tuuTJQ
+384-Taylor Swift - I Knew You Were Trouble vNoKguSdy4Y
+385-Drake - Started From The Bottom (Explicit) RubBzkZzpUA
+386-Rihanna, Kanye West, Paul McCartney - FourFiveSeconds kt0g4dWxEBo
+387-Miley Cyrus - Malibu (Official Video) 8j9zMok6two
+388-Lady Antebellum - Need You Now eM213aMKTHg
+389-Beyoncé - Love On Top (Official Video) Ob7vObnFUJc
+390-Guns N' Roses - Welcome To The Jungle o1tj2zJ2Wvg
+391-Tyga - Hookah (Official Music Video) ft. Young Thug b-J95fYuVz4
+392-Justin Bieber - As Long As You Love Me ft. Big Sean (Official Music Video) R4em3LKQCAQ
+393-J Balvin - Tranquila (Official Video) HWyEEj2pSt0
+394-One Direction - Night Changes syFZfO_wfMQ
+395-Farruko - Passion Whine ft. Sean Paul (Official Video) MNmc_XJp5rI
+396-Lady Gaga - Alejandro niqrrmev4mA
+397-Justin Bieber - Somebody To Love Remix ft. Usher (Official Music Video) SOI4OF7iIr4
+398-J Balvin - Sigo Extrañándote (Official Video) nZ0zbsZOdwg
+399-Avril Lavigne - When You're Gone (Official Video) 0G3_kG5FFfQ
+400-Desiigner - Panda (Official Music Video) E5ONTXHS2mM
+402-The Calling - Wherever You Will Go (Official Video) iAP9AF6DCu4
+403-Nego do Borel - Você Partiu Meu Coração ft. Anitta, Wesley Safadão (Video Oficial) Xp-dKdSUuLk
+404-Louis Tomlinson - Back to You (Official Video) ft. Bebe Rexha, Digital Farm Animals -HjpL-Ns6_A
+405-Maroon 5 - Maps (Explicit) (Official Music Video) NmugSMBh_iI
+406-The Weeknd - Often (NSFW) (Official Video) JPIhUaONiLU
+407-Nicki Minaj - Right By My Side (Explicit) ft. Chris Brown he3DJLXbebI
+408-Cali Y El Dandee - Yo Te Esperaré _KSyWS8UgA4
+409-Lana Del Rey - Young and Beautiful (Official Music Video) o_1aF54DO60
+411-The Killers - Mr. Brightside (Official Music Video) gGdGFtwCNBE
+412-One Direction - One Way Or Another (Teenage Kicks) 36mCEZzzQ3o
+413-Lil Wayne - Lollipop ft. Static (Official Music Video) 2IH8tNQAzSs
+415-Sam Smith - Lay Me Down (Official Video) HaMq2nn5ac0
+416-Kungs vs Cookin’ on 3 Burners - This Girl (Official Music Video) 2Y6Nne8RvaA
+417-Becky G - Shower 50-_oTkmF5I
+418-Jennifer Lopez - Papi (Official Video) 6XbIuSLaCnk
+419-Selena Gomez - Same Old Love 9h30Bx4Klxg
+420-Justin Bieber - Mistletoe (Official Music Video) LUjn3RpkcKY
+421-Dr. Dre ft. Eminem, Skylar Grey - I Need A Doctor (Explicit) [Official Video] VA770wpLX-Q
+422-Akon - Don't Matter (Official Music Video) JWA5hJl4Dv0
+423-Kelly Clarkson - Because Of You (VIDEO) Ra-Om7UMSJc
+424-DNCE - Cake By The Ocean vWaRiD5ym74
+425-Fifth Harmony - All In My Head (Flex) (Official Video) ft. Fetty Wap jsbeemdD2rQ
+426-Timbaland - Apologize ft. OneRepublic ZSM3w1v-A_Y
+427-Beyoncé, Shakira - Beautiful Liar QrOe2h9RtWI
+428-Demi Lovato - Give Your Heart a Break (Official Video) 1zfzka5VwRc
+429-Ariana Grande - The Way ft. Mac Miller _sV0S8qWSy0
+430-Beyoncé - Irreplaceable 2EwViQxSJJQ
+431-Logic - 1-800-273-8255 ft. Alessia Cara, Khalid (Official Video) Kb24RrHIbFk
+432-50 Cent - 21 Questions (Official Music Video) ft. Nate Dogg cDMhlvbOFaM
+433-Enrique Iglesias - Hero (Official Music Video) koJlIGDImiU
+434-Miley Cyrus - The Climb NG2zyeVRcbs
+435-Owl City - Fireflies (Official Music Video) psuRGfAaju4
+436-Thalia - Equivocada (Live Version) QPeNUfc8hGk
+437-Avril Lavigne - Wish You Were Here (Video) VT1-sitWRtY
+439-Eminem - Beautiful (Official Music Video) lgT1AidzRWM
+440-Christina Aguilera, Lil' Kim, Mya, Pink - Lady Marmalade (Official Music Video) RQa7SvVCdZk
+441-Rihanna - Pour It Up (Explicit) ehcVomMexkY
+442-Rihanna - Unfaithful (Official Music Video) rp4UwPZfRis
+443-J. Balvin, Jowell & Randy - Bonita (Official Video) SqpvOqRieYY
+444-Selena Gomez - Hands To Myself FMlcn-_jpWY
+445-One Direction - History (Official Video) yjmp8CoZBIo
+446-Calvin Harris - My Way (Official Video) b4Bj7Zb-YD4
+447-Nicki Minaj - Starships (Explicit) SeIJmciN8mo
+448-Reik - Creo en Ti snFhcHHdzT0
+449-Kings Of Leon - Sex on Fire (Official Video) RF0HhrwIwp0
+450-Justin Bieber - Love Me (Official Music Video) qdDVtFvJwUc
+451-The Black Eyed Peas - Boom Boom Pow (Official Music Video) 4m48GqaOz90
+452-Justin Timberlake - Cry Me A River (Official Video) DksSPZTZES0
+453-Lady Gaga - Telephone ft. Beyoncé (Official Music Video) EVBsypHzF3U
+454-Eminem - Like Toy Soldiers (Official Video) lexLAjh8fPA
+455-Naughty Boy ft. Beyoncé, Arrow Benjamin - Runnin' (Lose It All) [Official Video] eJSik6ejkr0
+456-Lil Wayne - Love Me ft. Drake, Future (Explicit) (Official Music Video) KY44zvhWhp4
+457-Kelly Clarkson - Stronger (What Doesn't Kill You) [Official Video] Xn676-fLq7I
+458-Descendants Cast - Rotten to the Core (from Descendants) (Official Video) zGlLe1w3DJM
+459-P!nk - So What (Official Video) FJfFZqTlWrQ
+460-Timbaland - The Way I Are (Official Music Video) ft. Keri Hilson, D.O.E., Sebastian U5rLz5AZBIA
+461-Vanilla Ice - Ice Ice Baby (Official Music Video) rog8ou-ZepE
+462-Bryson Tiller - Don't (Explicit Version) d7cVLE4SaN0
+463-Michael Jackson - The Way You Make Me Feel (Official Video) HzZ_urpj4As
+464-Machine Gun Kelly, Camila Cabello - Bad Things (Official Music Video) QpbQ4I3Eidg
+465-Eminem - You Don't Know (Official Music Video) ft. 50 Cent, Cashis, Lloyd Banks ngH0fkiNo-g
+467-Kanye West - Stronger PsO6ZnUZI0g
+468-Bloodhound Gang - The Bad Touch (Official Video) xat1GVnl8-k
+469-What Goes Around...Comes Around (Official Video) TOrnUquxtwA
+470-Reyli Barba - Amor del Bueno (Video) FUinZg5MC5U
+471-Owl City & Carly Rae Jepsen - Good Time (Official Video) H7HmzwI67ec
+472-Plan B - Candy 9FWgcBfs5A0
+473-The Black Eyed Peas - Meet Me Halfway (Official Music Video) I7HahVwYpwo
+474-Lady Gaga - Judas (Official Music Video) wagn8Wrmzuc
+475-Justin Bieber - One Less Lonely Girl (Official Music Video) LXUSaVw3Mvk
+476-Lady Gaga - Applause (Official Music Video) pco91kroVgQ
+477-Rihanna - Rehab (Official Music Video) ft. Justin Timberlake rJYcmq__nDM
+478-Ricardo Montaner - La Gloria de Dios (Video Oficial) ft. Evaluna Montaner LRsgqFu5c1o
+479-Maître Gims - Est-ce que tu m'aimes (Clip officiel) 6TpyRE_juyA
+480-Michael Jackson - Bad (Shortened Version) dsUXAEzaC3Q
+481-Beyoncé - Best Thing I Never Had (Video) FHp2KgyQUFk
+482-Shawn Mendes, Camila Cabello - I Know What You Did Last Summer (Official Video) ngORmvyvAaI
+483-Drake - Take Care ft. Rihanna -zzP29emgpg
+484-One Direction - Steal My Girl UpsKGvPjAgw
+485-Selena Gomez - Slow Down (Official) Z8eXaXoUJRQ
+486-Jennifer Lopez - Booty ft. Iggy Azalea (Official Video) nxtIRArhVD4
+487-Demi Lovato - Cool for the Summer (Official Video) il9nqWw9W3Y
+488-Tove Lo - Habits (Stay High) oh2LWWORoiM
+489-WALK THE MOON - Shut Up and Dance (Official Video) 6JCLY0Rlx6Q
+490-One Direction - Little Things xGPeNN9S0Fg
+491-Big Sean - I Don't Fuck With You (Official Music Video) ft. E-40 cZaJYDPY-YQ
+492-Enrique Iglesias - No Me Digas Que No (Official Music Video) ft. Wisin & Yandel zyqt2avPkoA
+494-Taylor Swift - Everything Has Changed ft. Ed Sheeran w1oM3kQpXRo
+495-Britney Spears - Work B_ch (Official Music Video) pt8VYOfr8To
+496-Nacho - Bailame a1J44C-PZ3E
+497-Axel - Te Voy A Amar KZh60U1PqSE
+498-Route 94 - My Love (Official Video) ft. Jess Glynne BS46C2z5lVE
+499-Kendji Girac - Andalouse (Clip Officiel) FndmvPkI1Ms
+500-Little Mix - Touch (Official Video) gBAfejjUQoA
+501-Iggy Azalea - Work (Official Music Video) _zR6ROjoOX0
+502-Wisin & Yandel - Estoy Enamorado whBcmlaSLJM
+503-Alicia Keys - Girl on Fire (Official Video) J91ti_MpdHA
+504-Avril Lavigne - What The Hell (Official Music Video) tQmEd_UeeIk
+505-Zara Larsson - Uncover (Official Music Video) U-PXEe-qeK4
+506-Lady Gaga - Just Dance ft. Colby O'Donis (Official Music Video) ft. Colby O'Donis 2Abk1jAONjw
+507-Maluma - La Temperatura (Video) ft. Eli Palacios Tgt6iaSYMEM
+508-Akon - Sorry, Blame It On Me (Official Music Video) ynMk2EwRi4Q
+509-CNCO, Yandel - Hey DJ (Official Video) X6wQOW9ihDA
+510-Selena Gomez & The Scene - Naturally a_YR4dKArgo
+511-Eminem - Space Bound (Official Video) JByDbPn6A1o
+512-YG - My Nigga ft. Jeezy, Rich Homie Quan (Explicit) (Official Music Video) MSrTnWDTdwI
+513-August Alsina - No Love ft. Nicki Minaj nxvm4P0jFKY
+514-Farruko - Obsesionado (Official Video) lkN51aqPOzU
+515-Rihanna - Hate That I Love You ft. Ne-Yo KMOOr7GEkj8
+516-Madonna - Bitch I'm Madonna ft. Nicki Minaj 7hPMmzKs62w
+517-Selena Gomez & The Scene - Who Says BzE1mX4Px0I
+518-Ariana Grande - One Last Time (Official) BPgEgaPk62M
+519-Calvin Harris - Sweet Nothing (Official Video) ft. Florence Welch 17ozSeGw-fY
+520-Maroon 5 - Misery (Official Music Video) 6g6g2mvItp4
+521-Jay-Z & Kanye West - Ni_as In Paris (Explicit) gG_dA32oH44
+523-Beyoncé - Sorry (Video) QxsmWxxouIM
+524-The Weeknd - Reminder (Official Video) JZjAg6fK-BQ
+525-Pusho - Te Fuiste ft. Ozuna aZOGcaU7q1A
+526-Jeremih - Down On Me ft. 50 Cent (Official Music Video) AaXaig_43lU
+527-Jordin Sparks, Chris Brown - No Air (Official Video) ft. Chris Brown WBKnpyoFEBo
+528-Marc Anthony - Valio La Pena (Salsa Version) Ns9YYSqLxyI
+529-Prince Royce - Back It Up (Official Video) ft. Jennifer Lopez, Pitbull 9w9dXWU5nMI
+530-Eminem - Cleanin' Out My Closet (Official Music Video) RQ9_TKayu9s
+531-Chris Brown - Kiss Kiss ft. T-Pain eNII9PDlFJ0
+532-Avicii vs Nicky Romero - I Could Be The One (Nicktim) bek1y2uiQGA
+533-Jessie J - Domino (Official Video) UJtB55MaoD0
+534-Don Omar - Zumba Campaign Video 8HpG0l9cLos
+535-Britney Spears - Womanizer (Director's Cut) (Official HD Video) rMqayQ-U74s
+536-Demi Lovato - Confident (Official Video) cwLRQn61oUY
+537-Usher - DJ Got Us Fallin' In Love (Official Music Video) ft. Pitbull C-dvTjK_07c
+538-Beyoncé - Pretty Hurts (Video) LXXQLa-5n5w
+539-Akon - I Wanna Love You ft. Snoop Dogg GJzF7H2e3Tw
+540-Of Monsters And Men - Little Talks (Official Video) ghb6eDopW8I
+541-Enrique Iglesias - I Like It (Official Music Video) X9_n8jakvWU
+542-Michael Jackson, Justin Timberlake - Love Never Felt So Good (Official Video) oG08ukJPtR8
+543-Akon - Beautiful (Official Music Video) ft. Colby O'Donis, Kardinal Offishall rSOzN0eihsE
+544-Farruko - Sunset (Official Video) ft. Shaggy, Nicky Jam ZBMsSPR9QMg
+545-Ace Hood - Bugatti (Official Music Video) (Explicit) ft. Future, Rick Ross djE-BLrdDDc
+546-Jennifer Lopez - I'm Into You ft. Lil Wayne IgLcQmlN2Xg
+547-Calibre 50 - Contigo oeeNs3KInbc
+548-will.i.am - Feelin' Myself ft. Miley Cyrus, Wiz Khalifa, French Montana VRuoR--LdqQ
+549-Diddy - Dirty Money - Coming Home ft. Skylar Grey (Official Video) k-ImCpNqbJw
+550-Bon Jovi - You Give Love A Bad Name (Official Music Video) KrZHPOeOxQQ
+551-Chamillionaire - Ridin' (Official Music Video) ft. Krayzie Bone CtwJvgPJ9xw
+552-Zedd - Clarity ft. Foxes (Official Music Video) IxxstCcJlsc
+553-Justin Bieber - Confident ft. Chance The Rapper (Official Music Video) 47YClVMlthI
+554-Lana Del Rey - Blue Jeans JRWox-i6aAk
+555-blink-182 - I Miss You (Official Video) s1tAYmMjLdY
+556-Fergie - M.I.L.F. $ (Official Music Video) bsUWK-fixiA
+557-Taylor Swift - Mine XPBwXKgDTdE
+558-Three Days Grace - I Hate Everything About You (Official Video) d8ekz_CSBVg
+559-T.I. - About The Money ft. Young Thug (Official Music Video) etfIdtm-OC8
+560-will.i.am - This Is Love ft. Eva Simons (Official Music Video) 9I9Ar6upx34
+561-Kid Ink - Show Me (Explicit) ft. Chris Brown xKkb13IU_DE
+562-will.i.am - #thatPOWER ft. Justin Bieber (Official Music Video) DGIgXP9SvB8
+563-Marc Anthony - Ahora Quien (Salsa Version) toLrTToaN0M
+564-Future - Where Ya At ft. Drake lw3Or6eqIpI
+565-Taylor Swift - …Ready For It wIft-t-MQuE
+566-Young Money - Bed Rock (Official Music Video) Ha80ZaecGkQ
+567-Romeo Santos - Promise ft. Usher Y3XyWhrZnqE
+568-Chris Brown, Tyga - Ayo (Official Video) zKCrSN9oXgQ
+569-Hailee Steinfeld - Love Myself (Official Video) bMpFmHSgC4Q
+570-Justin Bieber - Never Let You Go (Official Music Video) 3ExWsVFJlFo
+571-Rihanna - California King Bed nhBorPm6JjQ
+572-Ways to Be Wicked (from Descendants 2) (Official Video) lX6g_cm2rM4
+573-Leona Lewis - Bleeding Love (US Version - Official Video) Vzo-EL_62fQ
+574-Labrinth - Beneath Your Beautiful (Official Video) ft. Emeli Sandé bqIxCtEveG8
+575-Marc Anthony - A Quién Quiero Mentirle (Video) GeApuPcMVeQ
+576-Chris Brown - Next To You (Official Music Video) ft. Justin Bieber EEuQU6a90Pc
+577-Sia - Big Girls Cry (Official Video) 4NhKWZpkw1Q
+578-Shakira - She Wolf (Official HD Video) booKP974B0k
+579-Jay Sean - Down ft. Lil Wayne (Official Music Video) oUbpGmR1-QM
+581-Avicii - Addicted To You Qc9c12q3mrc
+582-Pitbull - Hotel Room Service (Official Video) 2up_Eq6r6Ko
+583-Lady Gaga - Paparazzi (Official Music Video) d2smz_1L2_0
+584-Chris Brown - Yeah 3x 3mC2ixOAivA
+585-Nick Jonas - Close ft. Tove Lo XgJFqVvb2Ws
+586-Hailee Steinfeld, Grey - Starving ft. Zedd (Official Video) xwjwCFZpdns
+587-Ne-Yo - One In A Million (Official Music Video) 6tpl9LtkRRw
+588-Lady Gaga - Born This Way (Official Music Video) wV1FrqwZyKw
+589-Chris Brown - Turn Up the Music eQWG8BVeryU
+590-Juan Magan - Si No Te Quisiera ft. Belinda, Lapiz Conciente XoNCV9BsU9c
+591-Katy Perry - Teenage Dream (Official Music Video) 98WtmW-lfeE
+592-Cher Lloyd - Want U Back (US Version) (Official Music Video) LPgvNlrBfb0
+593-Shakira - Addicted to You MntbN1DdEP0
+594-Sebastián Yatra - Alguien Robó ft. Wisin, Nacho EH0Wg8SaITQ
+595-Chris Brown - Forever (Official HD Video) 5sMKX22BHeE
+596-Snow Patrol - Chasing Cars (Official Video) GemKqzILV4w
+597-Lil Wayne - Drop The World ft. Eminem (Official Music Video) ft. Eminem ErCAOMi5EGM
+598-Miley Cyrus - 7 Things (Official Video) Hr0Wv5DJhuk
+599-Matheus & Kauan - O Nosso Santo Bateu – Na Praia Ao Vivo kbCtpDwVCLQ
+600-Maejor Ali - Lolly ft. Juicy J, Justin Bieber BiOmXeKyrxo
+601-Kings Of Leon - Use Somebody (Official Video) gnhXHvRoUd0
+602-Pitbull - Fireball ft. John Ryan HMqgVXSvwGo
+603-Calvin Harris - Feel So Close (Official Video) dGghkjpNCQ8
+604-Carly Rae Jepsen - I Really Like You qV5lzRHrGeg
+605-Demi Lovato - Skyscraper (Official Video) r_8ydghbGSg
+606-Keri Hilson - Knock You Down (Official Music Video) ft. Kanye West, Ne-Yo p_RqWocthcc
+607-Wisin & Yandel - Te Siento SKWxOsbt9gU
+608-The Black Eyed Peas - Just Can't Get Enough (Official Music Video) OrTyD7rjBpw
+609-Jennifer Lopez - Live It Up ft. Pitbull BofL1AaiTjo
+610-Eminem - Just Lose It (Official Music Video) 9dcVOmEQzKA
+612-The Black Eyed Peas - Don't Stop The Party (Official Music Video) u9LH_y159sg
+613-Tinie Tempah - Written In The Stars ft. Eric Turner YgFyi74DVjc
+614-Big Sean - Blessings (Official Explicit Video) ft. Drake, Kanye West M6t47RI4bns
+615-Britney Spears - I Wanna Go (Official Video) T-sxSd1uwoU
+616-Rihanna - Russian Roulette (Official Music Video) ZQ2nCGawrSY
+617-Ellie Goulding - On My Mind (Official Video) H202k7KfZL0
+618-Pitbull - Hey Baby (Drop It To The Floor) ft. T-Pain LefQdEMJP1I
+619-Maître Gims - J'me tire (Clip officiel) F_rEHfLgdcY
+620-LMFAO - Champagne Showers ft. Natalia Kills UA8rcLvS1BY
+621-Nicki Minaj - Pound The Alarm (Explicit) vdrqA93sW-8
+622-Maluma - La Curiosidad 9t7eMteW-Tc
+623-Shakira - Rabiosa (English Version) ft. Pitbull a5irTX82olg
+624-Rich Homie Quan - Type of Way (Official Video) -KKbdErJkiY
+625-P!nk - Just Like Fire (From'Alice Through The Looking Glass' - Official Video) 5Nrv5teMc9Y
+626-Rihanna - What Now (Official) b-3BI9AspYc
+627-Camila - De Que Me Sirve la Vida 3YhoejhnW8w
+628-Jennifer Lopez - Goin' In ft. Flo Rida z5W7DVFKrcs
+629-LMFAO ft. Lil Jon - Shots (Official Video) XNtTEibFvlQ
+630-Ciara - Like A Boy (Official Video) _HKH7Emy1SY
+631-Calvin Harris & Alesso - Under Control (Official Video) ft. Hurts yZqmarGShxg
+632-Fifth Harmony - BO$$ (BOSS) Y4JfPlry-iQ
+633-Eminem - Berzerk (Official Music Video) (Explicit) ab9176Srb5Y
+634-Years & Years - King (Official Video) g_uoH6hJilc
+635-Ne-Yo - So Sick (Official Music Video) IxszlJppRQI
+636-Selena Gomez & The Scene - A Year Without Rain M8uPvX2te0I
+637-Daddy Yankee - Sabado Rebelde ft. Plan B 0nPkXDrL2ZU
+638-Kanye West - All Of The Lights ft. Rihanna, Kid Cudi HAfFfqiYLp0
+639-Zedd - Stay The Night ft. Hayley Williams (Official Music Video) i-gyZ35074k
+640-Yandel - Como Antes (Official Video) ft. Wisin QeaumjX9DNY
+641-Taylor Swift - Back To December QUwxKWT6m7U
+642-Romeo Santos - Rival (Official Video) ft. Mario Domm 6vMhhBRj-2Q
+643-Henrique & Diego - Suíte 14 (Ao Vivo) ft. Mc Guimê gmvFLIuVAbA
+644-Britney Spears - Gimme More (Official HD Video) elueA2rofoo
+645-Rihanna - You Da One b3HeLs8Yosw
+646-Avicii - Hey Brother 6Cp6mKbRTQY
+647-Soulja Boy Tell'em ft. Sammie - Kiss Me Thru The Phone (Official Video) 47Fbo4kU2AU
+648-Beyoncé - Partition (Explicit Video) pZ12_E5R3qc
+649-Kid Cudi - Pursuit Of Happiness (Official Music Video) ft. MGMT 7xzU9Qqdqww
+650-Sigala - Sweet Lovin' ft. Bryn Christopher (Official Video) qj5zT4t7S6c
+651-The Game - My Life ft. Lil Wayne (Official Music Video) udxZ9zkDzpo
+652-Nicki Minaj - Moment 4 Life (Clean Version) (Official Music Video) ft. Drake D7GW8TYCEG4
+653-Nicki Minaj - High School (Explicit) ft. Lil Wayne RnpyRe_7jZA
+654-Chino & Nacho - Niña Bonita Oe1fRwgGu5E
+655-Far East Movement ft. The Cataracs, DEV - Like A G6 (Official Video) w4s6H4ku6ZY
+656-Pitbull, Ne-Yo - Time Of Our Lives bTXJQ5ql5Fw
+657-Lorde - Team f2JuxM-snGc
+658-Christina Aguilera - Candyman (Official Music Video) -ScjucUV8v0
+659-Katy Perry - I Kissed A Girl (Official) tAp9BKosZXs
+660-One Direction - Gotta Be You nvfejaHz-o0
+661-Nicki Minaj - Pills N Potions (Official) f7ld-3nZUxA
+662-Mohombi - Bumpy Ride G2RCCDSBEGk
+663-Demi Lovato - Neon Lights (Official Video) v9uDwppN5-w
+664-The Pussycat Dolls - When I Grow Up (Official Music Video) K0K46C82v9o
+665-Chris Brown - Don't Wake Me Up (Official Music Video) QOowQeKyNkQ
+666-Christina Aguilera - Hurt (Main Video) wwCykGDEp7M
+667-Eminem - We Made You (Official Music Video) RSdKmX2BH7o
+668-Taio Cruz - Break Your Heart (Official Video) ft. Ludacris y_SI2EDM6Lo
+669-Demi Lovato - Really Don't Care ft. Cher Lloyd (Official Video) OJGUbwVMBeA
+670-P!nk - Raise Your Glass (Official Video) XjVNlG5cZyQ
+671-Austin Mahone - Mmm Yeah ft. Pitbull MMAppa1cAVo
+672-Avril Lavigne - Smile (Official Music Video) KagvExF-ijc
+673-Little Mix - Wings (Official Video) cOQDsmEqVt8
+674-Brandon Beal - Twerk It Like Miley - Produced by Hedegaard ft. Christopher PLE57UZievU
+675-El Bebeto - No Te Creas Tan Importante (Video Oficial) nMv2PeG-2mc
+676-Eminem - My Name Is (Official Music Video) sNPnbI1arSE
+677-Justin Bieber - All That Matters JC2yu2a9sHk
+678-The Wanted - Glad You Came 2ggzxInyzVE
+679-Maluma - Addicted (Official Music Video) pMIHC_cItd4
+680-Pitbull - Fun (Official Video) ft. Chris Brown jKbR7u8J5PU
+681-Thalia - Desde Esa Noche (Official Video) ft. Maluma CkyBXdXkMr8
+682-Michael Jackson - You Rock My World (Official Video) 1-7ABIM2qjU
+683-The Band Perry - If I Die Young (Official Video) 7NJqUN9TClM
+684-Alessia Cara - Here (Official Video) UKp2CrfmVfw
+685-Güliz Ayla - Olmazsan Olmaz j-T4hRJNFJI
+686-Rihanna - Disturbia E1mU6h4Xdxc
+687-Beyoncé - Diva rNM5HW13_O8
+688-Bridgit Mendler - Ready or Not (Official Video) dPKG1-3LXBs
+689-Nicki Minaj - Beez In The Trap (Explicit) ft. 2 Chainz EmZvOhHF85I
+691-Snoop Dogg - 'Sweat' Snoop Dogg vs David Guetta (Remix) [Official Video] KnEXrbAQyIo
+692-Olly Murs - Troublemaker ft. Flo Rida 4aQDOUbErNg
+693-Ciara ft. Ludacris - Ride (Official Video) Lp6W4aK1sbs
+694-Chris Brown - Don't Judge Me z29nI8RQV0U
+695-Kendrick Lamar - LOYALTY. ft. Rihanna Dlh-dzB2U4Y
+696-Chris Brown - Love More (Explicit) ft. Nicki Minaj Tff2oE31Mlw
+697-Christina Aguilera - Your Body (Official Music Video) (Clean Version) 6cfCgLgiFDM
+699-MIKA - Popular Song ft. Ariana Grande nmcdLOjGVzw
+700-Britney Spears - Till The World Ends (Official Video) qzU9OrZlKb8
+701-Zendaya - Replay cyLE48i4XY0
+702-Shontelle - Impossible (Official Video) NWdrO4BoCu8
+703-Nicki Minaj, Cassie - The Boys (Explicit) kXFcr6oy5dk
+704-Miley Cyrus - Can't Be Tamed (Official Video) sjSG6z_13-Q
+705-Little Mix - Move (Official Video) RwD4eJGxPc4
+706-Selena Gomez - Tell Me Something I Don't Know _RRyniZG0Jo
+707-Britney Spears - Circus (Official HD Video) lVhJ_A8XUgc
+708-Avril Lavigne - Here's to Never Growing Up sXd2WxoOP5g
+709-Lady Gaga - LoveGame (Official Music Video) 1mB0tP1I-14
+710-Chris Brown - Gimme That (remix) ft. Lil Wayne 3yl-5FOZcr0
+711-Beyoncé - Sweet Dreams JlxByc0-V40
+712-Leona Lewis - Bleeding Love (Official Video) 7_weSk0BonM
+713-Taio Cruz - Dynamite (Official Video) VUjdiDeJ0xg
+714-Tinashe - 2 On (Explicit) ft. SchoolBoy Q -s7TCuCpB5c
+715-Natalie La Rose - Somebody ft. Jeremih 8zqdo_Umd5c
+717-Usher - OMG ft. will.i.am 1RnPB76mjxI
+718-Taylor Swift - Our Song Jb2stN7kH28
+719-Lil Wayne - How To Love (Official Music Video) y8Gf4-eT3w0
+720-Nicole Scherzinger - Right There ft. 50 Cent t-vTaktsUSw
+721-OneRepublic - Good Life (Official Music Video) jZhQOvvV45w
+722-Britney Spears, Iggy Azalea - Pretty Girls (Official Video) uV2uebhnqOw
+723-Ellie Goulding - Lights (Official Video) 0NKUpo_xKyQ
+724-Miley Cyrus - Adore You (Official Video) W1tzURKYFNs
+725-Kanye West - Heartless Co0tTeuUVhU
+726-Rihanna - Te Amo (Official Music Video) Oe4Ic7fHWf8
+727-Ariana Grande - Baby I bJuWlMFToNo
+728-Vanessa Hudgens - Say Ok (Official Music Video) F5VvvVxuKko
+729-DJ Khaled - I'm On One (Explicit Version) ft. Drake, Rick Ross, Lil Wayne Z09lYqdxqzo
+730-Demi Lovato - Made in the USA (Official Video) z3zdIHDTbg0
+731-Train - Drive By (Official Music Video) oxqnFJ3lp5k
+732-Eminem - The Way I Am (Official Music Video) mQvteoFiMlg
+733-Timbaland - Carry Out (Official Music Video) ft. Justin Timberlake NRdHsuuXxfk
+734-Daddy Yankee - La Noche De Los Dos ft. Natalia Jiménez GDBaeQ5JPuU
+735-Justin Bieber - U Smile (Official Music Video) r2ozuCXpVJY
+736-Ke$ha - Die Young (Official) NOubzHCUt48
+737-Nick Jonas - Jealous yw04QD1LaB0
+739-Ariana Grande - Right There ft. Big Sean fhcpubAVdmc
+740-Selena Gomez & The Scene - Hit The Lights 8c2ahBlTPz0
+741-Eminem - Survival (Explicit) NlmezywdxPI
+742-Miley Cyrus - Who Owns My Heart (Official Video) iVbQxC2c3-8
+743-Rihanna - Cheers (Drink To That) ZR0v0i63PQ4
+744-Sigala - Easy Love (Official Music Video) ozx898ADTxM
+745-Farruko - Besas Tan Bien (Official Video) E-kkX2UuBcg
+746-OneRepublic - All The Right Moves (Official Music Video) qrOeGCJdZe4
+747-Enrique Iglesias, Usher - Dirty Dancer ft. Lil Wayne vHJAUuicC0Q
+748-Austin Mahone - What About Love (Official Video) 2PEG82Udb90
+749-Rihanna - Hard (Official Music Video) ft. Jeezy Xcwd_Nz6Zog
+750-Lady Gaga - Perfect Illusion (Official Music Video) Xn599R0ZBwg
+752-MGMT - Electric Feel (Official HD Video) MmZexg8sxyk
+753-'Weird Al' Yankovic - White & Nerdy (Official Music Video) N9qYF9DZPdw
+754-Taylor Swift - White Horse D1Xr-JFLxik
+755-Miley Cyrus - When I Look At You (Official Video) 8wxOVn99FTE
+756-Ne-Yo - Let Me Love You (Until You Learn To Love Yourself) (Official Music Video) crrOl0egI00
+757-Cher Lloyd - Oath (Official Music Video) ft. Becky G Cqz713hhz1Y
+758-Timbaland - If We Ever Meet Again ft. Katy Perry (Official Music Video) KDKva-s_khY
+759-'Watch Me' from Disney Channel's 'Shake It Up' (Official Video) PPNMGYOm1aM
+761-Taio Cruz - Hangover (Official Video) ft. Flo Rida dLhFDYQHDQY
+762-Daddy Yankee - Ven Conmigo ft. Prince Royce ZEInlYjVFzk
+765-Demi Lovato - La La Land (Official Music Video) nmjO1p9Oxrk
+766-Selena Gomez & The Scene - Round & Round UfcvO2t8Ntg
+767-Britney Spears - Hold It Against Me (Official Video) -Edv8Onsrgg
+768-Far East Movement - Turn Up The Love ft. Cover Drive UqXVgAmqBOs
+769-Justin Bieber - Pray (Official Music Video) o9tJW9MDs2M
+770-Drake - Find Your Love (Official Music Video) Xyv4Bjja8yc
+772-Nicki Minaj - Va Va Voom (Explicit) 3U72hzeBLOw
+773-will.i.am, Nicki Minaj - Check It Out (Official Music Video) pqky5B179nM
+774-Nicki Minaj - Stupid Hoe (Explicit) T6j4f8cHBIM
+775-Jennifer Lopez ft. French Montana - I Luh Ya Papi (Explicit) [Official Video] c4oiEhf9M04
diff --git a/dataset/vevo_meta/top_chord.txt b/dataset/vevo_meta/top_chord.txt
new file mode 100644
index 0000000000000000000000000000000000000000..26456cc45487d0d14cc23aae16e163403dc01995
--- /dev/null
+++ b/dataset/vevo_meta/top_chord.txt
@@ -0,0 +1,30 @@
+C 1 32576
+A:min 122 31898
+F 66 22538
+G 92 22137
+E:min 57 7935
+D:min 31 6457
+D 27 3973
+A:min7 121 3846
+A 118 3606
+E 53 2613
+D:min7 30 2598
+F:maj7 78 2530
+A# 131 1854
+E:min7 56 1695
+E:7 63 1396
+G:7 102 1321
+C:maj7 13 1039
+C:7 11 791
+D:7 37 697
+G:min 96 685
+C:min 5 684
+B:min 148 528
+F:min 70 474
+B 144 459
+D# 40 459
+G# 105 452
+A:7 128 391
+F:7 76 384
+G:sus4 94 384
+G:min7 95 277
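Each line of top_chord.txt is `<chord label> <chord id from chord_inv.json> <occurrence count>`, sorted by descending count, so "C 1 32576" says the plain C major triad (id 1) is the most frequent chord in the corpus. A small parsing sketch (illustrative, not repo code):

```python
# Illustrative sketch: parse top_chord.txt rows of the form
# "<label> <chord id> <count>", sorted by descending occurrence count.
def load_top_chords(path="dataset/vevo_meta/top_chord.txt"):
    rows = []
    with open(path) as f:
        for line in f:
            label, chord_id, count = line.split()
            rows.append((label, int(chord_id), int(count)))
    return rows

top = load_top_chords()
print(top[0])   # ('C', 1, 32576): the most frequent chord
```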
diff --git a/default_sound_font.sf2 b/default_sound_font.sf2
new file mode 100644
index 0000000000000000000000000000000000000000..14b4bfccc13d330c811e8a2b4630d314173e40fe
--- /dev/null
+++ b/default_sound_font.sf2
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74594e8f4250680adf590507a306655a299935343583256f3b722c48a1bc1cb0
+size 148398306
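The ~148 MB soundfont is committed as a Git LFS pointer, so only the three-line stub above lives in the repository. A hedged sketch of rendering a generated MIDI file to audio with it, assuming the midi2audio package and a locally installed fluidsynth binary; "output.mid" and "output.wav" are placeholder paths:

```python
# Hedged sketch: render a MIDI file to audio with the bundled soundfont.
# Assumes the midi2audio package and a fluidsynth binary are installed;
# "output.mid" / "output.wav" are placeholder paths.
from midi2audio import FluidSynth

fs = FluidSynth(sound_font="default_sound_font.sf2")
fs.midi_to_audio("output.mid", "output.wav")
```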
diff --git a/model/__pycache__/music_transformer.cpython-37.pyc b/model/__pycache__/music_transformer.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c092bb158323b32021470593d5d8c0041adb85c
Binary files /dev/null and b/model/__pycache__/music_transformer.cpython-37.pyc differ
diff --git a/model/__pycache__/positional_encoding.cpython-37.pyc b/model/__pycache__/positional_encoding.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43d4430de58959375ba40dc1250804bf7fd81868
Binary files /dev/null and b/model/__pycache__/positional_encoding.cpython-37.pyc differ
diff --git a/model/__pycache__/positional_encoding.cpython-38.pyc b/model/__pycache__/positional_encoding.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a418d963e1a78f4e87f310189c714cd49776b1e
Binary files /dev/null and b/model/__pycache__/positional_encoding.cpython-38.pyc differ
diff --git a/model/__pycache__/rpr.cpython-37.pyc b/model/__pycache__/rpr.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4deeff2fa2df843f3927bf9280e008b82e76ae20
Binary files /dev/null and b/model/__pycache__/rpr.cpython-37.pyc differ
diff --git a/model/__pycache__/rpr.cpython-38.pyc b/model/__pycache__/rpr.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3418d5d9daa7e7553a735bad69e6dfe1cb3917a9
Binary files /dev/null and b/model/__pycache__/rpr.cpython-38.pyc differ
diff --git a/model/__pycache__/video_music_transformer.cpython-37.pyc b/model/__pycache__/video_music_transformer.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b4b3efb1730d7849ae97674a26859debe332cc9
Binary files /dev/null and b/model/__pycache__/video_music_transformer.cpython-37.pyc differ
diff --git a/model/__pycache__/video_music_transformer.cpython-38.pyc b/model/__pycache__/video_music_transformer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cf86a1bfae9baf9d0733a0e96b79845a95acb7b
Binary files /dev/null and b/model/__pycache__/video_music_transformer.cpython-38.pyc differ
diff --git a/model/__pycache__/video_regression.cpython-37.pyc b/model/__pycache__/video_regression.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e4dd9d485e65d7310362da2cceb6bb3b7d5482b
Binary files /dev/null and b/model/__pycache__/video_regression.cpython-37.pyc differ
diff --git a/model/__pycache__/video_regression.cpython-38.pyc b/model/__pycache__/video_regression.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4ff92ea3fd9606ff5515b9132657ba899ca62c1
Binary files /dev/null and b/model/__pycache__/video_regression.cpython-38.pyc differ
diff --git a/model/loss.py b/model/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..62ff3d5d39a13ced021e1b9ca27973804a3262e7
--- /dev/null
+++ b/model/loss.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.loss import _Loss
+
+# Borrowed from https://github.com/jason9693/MusicTransformer-pytorch/blob/5f183374833ff6b7e17f3a24e3594dedd93a5fe5/custom/criterion.py#L28
+class SmoothCrossEntropyLoss(_Loss):
+ """
+ https://arxiv.org/abs/1512.00567
+ """
+ __constants__ = ['label_smoothing', 'vocab_size', 'ignore_index', 'reduction']
+
+ def __init__(self, label_smoothing, vocab_size, ignore_index=-100, reduction='mean', is_logits=True):
+ assert 0.0 <= label_smoothing <= 1.0
+ super().__init__(reduction=reduction)
+
+ self.label_smoothing = label_smoothing
+ self.vocab_size = vocab_size
+ self.ignore_index = ignore_index
+ self.input_is_logits = is_logits
+
+ def forward(self, input, target):
+ """
+ Args:
+ input: [B * T, V]
+ target: [B * T]
+ Returns:
+ cross entropy: [1]
+ """
+ mask = (target == self.ignore_index).unsqueeze(-1)
+ q = F.one_hot(target.long(), self.vocab_size).type(torch.float32)
+ u = 1.0 / self.vocab_size
+ q_prime = (1.0 - self.label_smoothing) * q + self.label_smoothing * u
+ q_prime = q_prime.masked_fill(mask, 0)
+
+ ce = self.cross_entropy_with_logits(q_prime, input)
+ if self.reduction == 'mean':
+ lengths = torch.sum(target != self.ignore_index)
+ return ce.sum() / lengths
+ elif self.reduction == 'sum':
+ return ce.sum()
+ else:
+ raise NotImplementedError
+
+ def cross_entropy_with_logits(self, p, q):
+ return -torch.sum(p * (q - q.logsumexp(dim=-1, keepdim=True)), dim=-1)
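SmoothCrossEntropyLoss mixes the one-hot target with a uniform distribution, q' = (1 - eps) * q + eps / V, zeroes the target at ignore_index positions, and averages the cross entropy over the remaining tokens. A minimal usage sketch; the shapes and the 157-symbol vocabulary (the ids in chord_inv.json) are illustrative assumptions, and the repo's actual CHORD_SIZE constant may include additional pad/end tokens:

```python
# Minimal usage sketch for SmoothCrossEntropyLoss. vocab_size=157 (the ids in
# chord_inv.json) and the batch/sequence sizes are illustrative assumptions.
import torch
from model.loss import SmoothCrossEntropyLoss

criterion = SmoothCrossEntropyLoss(label_smoothing=0.1, vocab_size=157)

logits = torch.randn(8 * 30, 157)            # [B * T, V] unnormalized scores
targets = torch.randint(0, 157, (8 * 30,))   # [B * T] chord ids
print(criterion(logits, targets))            # scalar smoothed cross entropy
```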
diff --git a/model/music_transformer.py b/model/music_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..633b2b335c098ae2c19d023a5ece8424e559034c
--- /dev/null
+++ b/model/music_transformer.py
@@ -0,0 +1,177 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.normalization import LayerNorm
+import random
+
+from utilities.constants import *
+from utilities.device import get_device
+
+from .positional_encoding import PositionalEncoding
+from .rpr import TransformerEncoderRPR, TransformerEncoderLayerRPR
+import json
+# MusicTransformer
+class MusicTransformer(nn.Module):
+ def __init__(self, n_layers=6, num_heads=8, d_model=512, dim_feedforward=1024,
+ dropout=0.1, max_sequence_midi=2048, max_sequence_chord=300, rpr=False):
+ super(MusicTransformer, self).__init__()
+
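+ # A no-op decoder: together with num_decoder_layers=0 below, it makes
+ # nn.Transformer behave as an encoder-only stack.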
+ self.dummy = DummyDecoder()
+ self.nlayers = n_layers
+ self.nhead = num_heads
+ self.d_model = d_model
+ self.d_ff = dim_feedforward
+ self.dropout = dropout
+ self.max_seq_midi = max_sequence_midi
+ self.max_seq_chord = max_sequence_chord
+ self.rpr = rpr
+
+ # Input embeddings for chord tokens (full chord, root, and attribute)
+ self.embedding = nn.Embedding(CHORD_SIZE, self.d_model)
+
+ # self.embedding_key = nn.Embedding(1, self.d_model)
+ self.embedding_root = nn.Embedding(CHORD_ROOT_SIZE, self.d_model)
+ self.embedding_attr = nn.Embedding(CHORD_ATTR_SIZE, self.d_model)
+
+ self.positional_encoding = PositionalEncoding(self.d_model, self.dropout, self.max_seq_chord)
+ self.Linear_chord = nn.Linear(self.d_model+1, self.d_model)
+
+ # Base transformer
+ if(not self.rpr):
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=0, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff, custom_decoder=self.dummy
+ )
+ # RPR Transformer
+ else:
+ encoder_norm = LayerNorm(self.d_model)
+ encoder_layer = TransformerEncoderLayerRPR(self.d_model, self.nhead, self.d_ff, self.dropout, er_len=self.max_seq_chord)
+
+ encoder = TransformerEncoderRPR(encoder_layer, self.nlayers, encoder_norm)
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=0, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff, custom_decoder=self.dummy, custom_encoder=encoder
+ )
+ # Final output is a softmaxed linear layer
+ self.Wout = nn.Linear(self.d_model, CHORD_SIZE)
+ self.Wout_root = nn.Linear(self.d_model, CHORD_ROOT_SIZE)
+ self.Wout_attr = nn.Linear(self.d_model, CHORD_ATTR_SIZE)
+ self.softmax = nn.Softmax(dim=-1)
+
+ # forward
+ def forward(self, x, x_root, x_attr, feature_key, mask=True):
+ if(mask is True):
+ mask = self.transformer.generate_square_subsequent_mask(x.shape[1]).to(get_device())
+ else:
+ mask = None
+
+ ### Chord + Key (DECODER) ###
+ # x = self.embedding(x)
+
+ x_root = self.embedding_root(x_root)
+ x_attr = self.embedding_attr(x_attr)
+ x = x_root + x_attr
+
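+ # Condition on the musical key: broadcast the scalar key feature over
+ # (batch, seq, 1), concatenate it to the chord embedding, and project
+ # back down to d_model.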
+ feature_key_padded = torch.full((x.shape[0], x.shape[1], 1), feature_key.item())
+ feature_key_padded = feature_key_padded.to(get_device())
+ x = torch.cat([x, feature_key_padded], dim=-1)
+ xf = self.Linear_chord(x)
+
+ ### POSITIONAL ENCODING ###
+ xf = xf.permute(1,0,2) # -> (seq_len, batch_size, d_model)
+ xf = self.positional_encoding(xf)
+
+ ### TRANSFORMER ###
+ x_out = self.transformer(src=xf, tgt=xf, tgt_mask=mask)
+ x_out = x_out.permute(1,0,2)
+
+ if IS_SEPERATED:
+ y_root = self.Wout_root(x_out)
+ y_attr = self.Wout_attr(x_out)
+ del mask
+ return y_root, y_attr
+ else:
+ y = self.Wout(x_out)
+ del mask
+ return y
+
+ # generate
+ def generate(self, feature_key=None, primer=None, primer_root=None, primer_attr=None, target_seq_length=300, beam=0, beam_chance=1.0):
+ assert (not self.training), "Cannot generate while in training mode"
+
+ with open('dataset/vevo_meta/chord_inv.json') as json_file:
+ chordInvDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_root.json') as json_file:
+ chordRootDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_attr.json') as json_file:
+ chordAttrDic = json.load(json_file)
+
+ print("Generating sequence of max length:", target_seq_length)
+ gen_seq = torch.full((1,target_seq_length), CHORD_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())
+ gen_seq_root = torch.full((1,target_seq_length), CHORD_ROOT_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())
+ gen_seq_attr = torch.full((1,target_seq_length), CHORD_ATTR_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())
+
+ num_primer = len(primer)
+
+ gen_seq[..., :num_primer] = primer.type(TORCH_LABEL_TYPE).to(get_device())
+ gen_seq_root[..., :num_primer] = primer_root.type(TORCH_LABEL_TYPE).to(get_device())
+
+ gen_seq_attr[..., :num_primer] = primer_attr.type(TORCH_LABEL_TYPE).to(get_device())
+
+ cur_i = num_primer
+ while(cur_i < target_seq_length):
+ # gen_seq_batch = gen_seq.clone()
+ # y = self.softmax(self.forward(gen_seq[..., :cur_i]))[..., :CHORD_END]
+ y = self.softmax( self.forward( gen_seq[..., :cur_i], gen_seq_root[..., :cur_i], gen_seq_attr[..., :cur_i], feature_key) )[..., :CHORD_END]
+
+ token_probs = y[:, cur_i-1, :]
+ if(beam == 0):
+ beam_ran = 2.0
+ else:
+ beam_ran = random.uniform(0,1)
+ if(beam_ran <= beam_chance):
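+ # Top-k over the flattened (beams x vocab) probabilities; each flat
+ # index encodes both the source beam (row) and the chosen token (column).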
+ token_probs = token_probs.flatten()
+ top_res, top_i = torch.topk(token_probs, beam)
+ beam_rows = top_i // CHORD_SIZE
+ beam_cols = top_i % CHORD_SIZE
+ gen_seq = gen_seq[beam_rows, :]
+ gen_seq[..., cur_i] = beam_cols
+ else:
+ distrib = torch.distributions.categorical.Categorical(probs=token_probs)
+ next_token = distrib.sample()
+ #print("next token:",next_token)
+ gen_seq[:, cur_i] = next_token
+ gen_chord = chordInvDic[ str( next_token.item() ) ]
+
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = 1
+ chordRootID = torch.tensor([chordRootID]).to(get_device())
+ chordAttrID = torch.tensor([chordAttrID]).to(get_device())
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+ elif len(chord_arr) == 2:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ chordRootID = torch.tensor([chordRootID]).to(get_device())
+ chordAttrID = torch.tensor([chordAttrID]).to(get_device())
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+
+ # Let the transformer decide to end if it wants to
+ if(next_token == CHORD_END):
+ print("Model called end of sequence at:", cur_i, "/", target_seq_length)
+ break
+
+ cur_i += 1
+ if(cur_i % 50 == 0):
+ print(cur_i, "/", target_seq_length)
+ return gen_seq[:, :cur_i]
+
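+# DummyDecoder is an identity decoder: it returns the encoder memory unchanged,
+# which lets nn.Transformer run as an encoder-only stack (num_decoder_layers=0).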
+class DummyDecoder(nn.Module):
+ def __init__(self):
+ super(DummyDecoder, self).__init__()
+ def forward(self, tgt, memory, tgt_mask, memory_mask,tgt_key_padding_mask,memory_key_padding_mask):
+ return memory
diff --git a/model/positional_encoding.py b/model/positional_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..5820a69d721c879cfcdb21db63b9aef83b98f2bb
--- /dev/null
+++ b/model/positional_encoding.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+import math
+
+# PositionalEncoding
+# Taken from https://pytorch.org/tutorials/beginner/transformer_tutorial.html
+class PositionalEncoding(nn.Module):
+
+ def __init__(self, d_model, dropout=0.1, max_len=5000):
+ super(PositionalEncoding, self).__init__()
+ self.dropout = nn.Dropout(p=dropout)
+
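+ # Sinusoidal table: PE(pos, 2i) = sin(pos / 10000^(2i/d_model)),
+ # PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))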
+ pe = torch.zeros(max_len, d_model)
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0).transpose(0, 1)
+ self.register_buffer('pe', pe)
+
+ def forward(self, x):
+ x = x + self.pe[:x.size(0), :]
+ return self.dropout(x)
diff --git a/model/rpr.py b/model/rpr.py
new file mode 100644
index 0000000000000000000000000000000000000000..1573451715f8cbcdb8834bc11f7372441d843d95
--- /dev/null
+++ b/model/rpr.py
@@ -0,0 +1,455 @@
+import torch
+import torch.nn as nn
+import warnings
+
+from torch.nn import functional as F
+from torch.nn.parameter import Parameter
+from torch.nn import Module
+from torch.nn.modules.transformer import _get_clones
+from torch.nn.modules.linear import Linear
+from torch.nn.modules.dropout import Dropout
+from torch.nn.modules.normalization import LayerNorm
+from torch.nn.init import *
+
+from torch.nn.functional import linear, softmax, dropout
+from torch.nn import MultiheadAttention
+from typing import Optional
+
+class TransformerDecoderRPR(Module):
+ def __init__(self, decoder_layer, num_layers, norm=None):
+ super(TransformerDecoderRPR, self).__init__()
+ self.layers = _get_clones(decoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+
+ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
+ output = tgt
+ for mod in self.layers:
+ output = mod(output, memory, tgt_mask=tgt_mask,
+ memory_mask=memory_mask,
+ tgt_key_padding_mask=tgt_key_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+
+ if self.norm is not None:
+ output = self.norm(output)
+
+ return output
+
+class TransformerDecoderLayerRPR(Module):
+ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, er_len=None):
+ super(TransformerDecoderLayerRPR, self).__init__()
+
+ self.self_attn = MultiheadAttentionRPR(d_model, nhead, dropout=dropout, er_len=er_len)
+ self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+ # Implementation of Feedforward model
+ self.linear1 = Linear(d_model, dim_feedforward)
+ self.dropout = Dropout(dropout)
+ self.linear2 = Linear(dim_feedforward, d_model)
+
+ self.norm1 = LayerNorm(d_model)
+ self.norm2 = LayerNorm(d_model)
+ self.norm3 = LayerNorm(d_model)
+ self.dropout1 = Dropout(dropout)
+ self.dropout2 = Dropout(dropout)
+ self.dropout3 = Dropout(dropout)
+
+ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
+ tgt_key_padding_mask=None, memory_key_padding_mask=None):
+ tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
+ key_padding_mask=tgt_key_padding_mask)[0]
+ tgt = tgt + self.dropout1(tgt2)
+ tgt = self.norm1(tgt)
+
+ tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
+ key_padding_mask=memory_key_padding_mask)[0]
+
+ tgt = tgt + self.dropout2(tgt2)
+ tgt = self.norm2(tgt)
+ tgt2 = self.linear2(self.dropout(F.relu(self.linear1(tgt))))
+ tgt = tgt + self.dropout3(tgt2)
+ tgt = self.norm3(tgt)
+ return tgt
+
+# TransformerEncoderRPR (only for music transformer)
+class TransformerEncoderRPR(Module):
+ def __init__(self, encoder_layer, num_layers, norm=None):
+ super(TransformerEncoderRPR, self).__init__()
+ self.layers = _get_clones(encoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+ def forward(self, src, mask=None, src_key_padding_mask=None):
+ output = src
+ for i in range(self.num_layers):
+ output = self.layers[i](output, src_mask=mask,
+ src_key_padding_mask=src_key_padding_mask)
+ if self.norm:
+ output = self.norm(output)
+ return output
+
+# TransformerEncoderLayerRPR (only for music transformer)
+class TransformerEncoderLayerRPR(Module):
+ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, er_len=None):
+ super(TransformerEncoderLayerRPR, self).__init__()
+ self.self_attn = MultiheadAttentionRPR(d_model, nhead, dropout=dropout, er_len=er_len)
+ # Implementation of Feedforward model
+ self.linear1 = Linear(d_model, dim_feedforward)
+ self.dropout = Dropout(dropout)
+ self.linear2 = Linear(dim_feedforward, d_model)
+ self.norm1 = LayerNorm(d_model)
+ self.norm2 = LayerNorm(d_model)
+ self.dropout1 = Dropout(dropout)
+ self.dropout2 = Dropout(dropout)
+ def forward(self, src, src_mask=None, src_key_padding_mask=None):
+ src2 = self.self_attn(src, src, src, attn_mask=src_mask,
+ key_padding_mask=src_key_padding_mask)[0]
+ src = src + self.dropout1(src2)
+ src = self.norm1(src)
+ src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
+ src = src + self.dropout2(src2)
+ src = self.norm2(src)
+ return src
+
+# MultiheadAttentionRPR
+class MultiheadAttentionRPR(Module):
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None, er_len=None):
+ super(MultiheadAttentionRPR, self).__init__()
+ self.embed_dim = embed_dim
+ self.kdim = kdim if kdim is not None else embed_dim
+ self.vdim = vdim if vdim is not None else embed_dim
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+ self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
+
+ if self._qkv_same_embed_dim is False:
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
+
+ if bias:
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
+ else:
+ self.register_parameter('in_proj_bias', None)
+ self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
+
+ if add_bias_kv:
+ self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
+ self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
+ else:
+ self.bias_k = self.bias_v = None
+
+ self.add_zero_attn = add_zero_attn
+
+ # Adding RPR embedding matrix
+ if(er_len is not None):
+ self.Er = Parameter(torch.rand((er_len, self.head_dim), dtype=torch.float32))
+ else:
+ self.Er = None
+
+ self._reset_parameters()
+
+ def _reset_parameters(self):
+ if self._qkv_same_embed_dim:
+ xavier_uniform_(self.in_proj_weight)
+ else:
+ xavier_uniform_(self.q_proj_weight)
+ xavier_uniform_(self.k_proj_weight)
+ xavier_uniform_(self.v_proj_weight)
+
+ if self.in_proj_bias is not None:
+ constant_(self.in_proj_bias, 0.)
+ constant_(self.out_proj.bias, 0.)
+ if self.bias_k is not None:
+ xavier_normal_(self.bias_k)
+ if self.bias_v is not None:
+ xavier_normal_(self.bias_v)
+
+ def forward(self, query, key, value, key_padding_mask=None,
+ need_weights=True, attn_mask=None):
+
+ if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False:
+
+ return multi_head_attention_forward_rpr(
+ query, key, value, self.embed_dim, self.num_heads,
+ self.in_proj_weight, self.in_proj_bias,
+ self.bias_k, self.bias_v, self.add_zero_attn,
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
+ training=self.training,
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
+ attn_mask=attn_mask, use_separate_proj_weight=True,
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
+ v_proj_weight=self.v_proj_weight, rpr_mat=self.Er)
+ else:
+ if not hasattr(self, '_qkv_same_embed_dim'):
+ warnings.warn('A new version of MultiheadAttention module has been implemented. \
+ Please re-train your model with the new module',
+ UserWarning)
+
+ return multi_head_attention_forward_rpr(
+ query, key, value, self.embed_dim, self.num_heads,
+ self.in_proj_weight, self.in_proj_bias,
+ self.bias_k, self.bias_v, self.add_zero_attn,
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
+ training=self.training,
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
+ attn_mask=attn_mask, rpr_mat=self.Er)
+
+# multi_head_attention_forward_rpr
+def multi_head_attention_forward_rpr(query, # type: Tensor
+ key, # type: Tensor
+ value, # type: Tensor
+ embed_dim_to_check, # type: int
+ num_heads, # type: int
+ in_proj_weight, # type: Tensor
+ in_proj_bias, # type: Tensor
+ bias_k, # type: Optional[Tensor]
+ bias_v, # type: Optional[Tensor]
+ add_zero_attn, # type: bool
+ dropout_p, # type: float
+ out_proj_weight, # type: Tensor
+ out_proj_bias, # type: Tensor
+ training=True, # type: bool
+ key_padding_mask=None, # type: Optional[Tensor]
+ need_weights=True, # type: bool
+ attn_mask=None, # type: Optional[Tensor]
+ use_separate_proj_weight=False, # type: bool
+ q_proj_weight=None, # type: Optional[Tensor]
+ k_proj_weight=None, # type: Optional[Tensor]
+ v_proj_weight=None, # type: Optional[Tensor]
+ static_k=None, # type: Optional[Tensor]
+ static_v=None, # type: Optional[Tensor]
+ rpr_mat=None
+ ):
+ """
+ ----------
+ Author: Pytorch
+ Modified: Damon Gwinn
+ ----------
+ For Relative Position Representation support (https://arxiv.org/abs/1803.02155)
+ https://pytorch.org/docs/1.2.0/_modules/torch/nn/functional.html
+ Modification to take RPR embedding matrix and perform skew optimized RPR (https://arxiv.org/abs/1809.04281)
+ ----------
+ """
+ # type: (...) -> Tuple[Tensor, Optional[Tensor]]
+
+ qkv_same = torch.equal(query, key) and torch.equal(key, value)
+ kv_same = torch.equal(key, value)
+
+ tgt_len, bsz, embed_dim = query.size()
+ assert embed_dim == embed_dim_to_check
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
+ assert key.size() == value.size()
+
+ head_dim = embed_dim // num_heads
+ assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+ scaling = float(head_dim) ** -0.5
+
+ if use_separate_proj_weight is not True:
+ if qkv_same:
+ # self-attention
+ q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
+
+ elif kv_same:
+ # encoder-decoder attention
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = 0
+ _end = embed_dim
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ q = linear(query, _w, _b)
+
+ if key is None:
+ assert value is None
+ k = None
+ v = None
+ else:
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim
+ _end = None
+ _w = in_proj_weight[_start:, :]
+ if _b is not None:
+ _b = _b[_start:]
+ k, v = linear(key, _w, _b).chunk(2, dim=-1)
+
+ else:
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = 0
+ _end = embed_dim
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ q = linear(query, _w, _b)
+
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim
+ _end = embed_dim * 2
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ k = linear(key, _w, _b)
+
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim * 2
+ _end = None
+ _w = in_proj_weight[_start:, :]
+ if _b is not None:
+ _b = _b[_start:]
+ v = linear(value, _w, _b)
+ else:
+ q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
+ len1, len2 = q_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == query.size(-1)
+
+ k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
+ len1, len2 = k_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == key.size(-1)
+
+ v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
+ len1, len2 = v_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == value.size(-1)
+
+ if in_proj_bias is not None:
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
+ else:
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias)
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias)
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias)
+ q = q * scaling
+
+ if bias_k is not None and bias_v is not None:
+ if static_k is None and static_v is None:
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+ if attn_mask is not None:
+ attn_mask = torch.cat([attn_mask,
+ torch.zeros((attn_mask.size(0), 1),
+ dtype=attn_mask.dtype,
+ device=attn_mask.device)], dim=1)
+ if key_padding_mask is not None:
+ key_padding_mask = torch.cat(
+ [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
+ dtype=key_padding_mask.dtype,
+ device=key_padding_mask.device)], dim=1)
+ else:
+ assert static_k is None, "bias cannot be added to static key."
+ assert static_v is None, "bias cannot be added to static value."
+ else:
+ assert bias_k is None
+ assert bias_v is None
+
+ q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+ if k is not None:
+ k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+ if v is not None:
+ v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+
+ if static_k is not None:
+ assert static_k.size(0) == bsz * num_heads
+ assert static_k.size(2) == head_dim
+ k = static_k
+
+ if static_v is not None:
+ assert static_v.size(0) == bsz * num_heads
+ assert static_v.size(2) == head_dim
+ v = static_v
+
+ src_len = k.size(1)
+
+ if key_padding_mask is not None:
+ assert key_padding_mask.size(0) == bsz
+ assert key_padding_mask.size(1) == src_len
+
+ if add_zero_attn:
+ src_len += 1
+ k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
+ v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
+ if attn_mask is not None:
+ attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1),
+ dtype=attn_mask.dtype,
+ device=attn_mask.device)], dim=1)
+ if key_padding_mask is not None:
+ key_padding_mask = torch.cat(
+ [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
+ dtype=key_padding_mask.dtype,
+ device=key_padding_mask.device)], dim=1)
+
+ attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+ assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
+
+ ######### ADDITION OF RPR ###########
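+ # Relative position bias: Srel = skew(Q . Er^T) is added to the scaled
+ # dot-product logits before masking and softmax (Shaw et al. / Huang et al.).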
+ if(rpr_mat is not None):
+ rpr_mat = _get_valid_embedding(rpr_mat, q.shape[1], k.shape[1])
+ qe = torch.einsum("hld,md->hlm", q, rpr_mat)
+ srel = _skew(qe)
+ attn_output_weights += srel
+
+ if attn_mask is not None:
+ attn_mask = attn_mask.unsqueeze(0)
+ attn_output_weights += attn_mask
+
+ if key_padding_mask is not None:
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+ attn_output_weights = attn_output_weights.masked_fill(
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
+ float('-inf'),
+ )
+ attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
+
+ attn_output_weights = softmax(
+ attn_output_weights, dim=-1)
+
+ attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
+
+ attn_output = torch.bmm(attn_output_weights, v)
+ assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+ attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+
+ if need_weights:
+ # average attention weights over heads
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
+ else:
+ return attn_output, None
+
+def _get_valid_embedding(Er, len_q, len_k):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Gets valid embeddings based on max length of RPR attention
+ ----------
+ """
+
+ len_e = Er.shape[0]
+ start = max(0, len_e - len_q)
+ return Er[start:, :]
+
+def _skew(qe):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Performs the skew optimized RPR computation (https://arxiv.org/abs/1809.04281)
+ ----------
+ """
+ sz = qe.shape[1]
+ mask = (torch.triu(torch.ones(sz, sz).to(qe.device)) == 1).float().flip(0)
+
+ qe = mask * qe
+ qe = F.pad(qe, (1,0, 0,0, 0,0))
+ qe = torch.reshape(qe, (qe.shape[0], qe.shape[2], qe.shape[1]))
+
+ srel = qe[:, 1:, :]
+ return srel
diff --git a/model/video_music_transformer.py b/model/video_music_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..146e8ba81c1e26d3b70e1f72287cc8a4793ff8a4
--- /dev/null
+++ b/model/video_music_transformer.py
@@ -0,0 +1,205 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.normalization import LayerNorm
+import random
+import numpy as np
+from utilities.constants import *
+from utilities.device import get_device
+from .positional_encoding import PositionalEncoding
+from .rpr import TransformerDecoderRPR, TransformerDecoderLayerRPR
+from datetime import datetime
+import json
+
+
+class VideoMusicTransformer(nn.Module):
+ def __init__(self, n_layers=6, num_heads=8, d_model=512, dim_feedforward=1024,
+ dropout=0.1, max_sequence_midi =2048, max_sequence_video=300, max_sequence_chord=300, total_vf_dim = 0, rpr=False):
+ super(VideoMusicTransformer, self).__init__()
+ self.nlayers = n_layers
+ self.nhead = num_heads
+ self.d_model = d_model
+ self.d_ff = dim_feedforward
+ self.dropout = dropout
+ self.max_seq_midi = max_sequence_midi
+ self.max_seq_video = max_sequence_video
+ self.max_seq_chord = max_sequence_chord
+ self.rpr = rpr
+
+ # Input embedding for video and music features
+ self.embedding = nn.Embedding(CHORD_SIZE, self.d_model)
+ self.embedding_root = nn.Embedding(CHORD_ROOT_SIZE, self.d_model)
+ self.embedding_attr = nn.Embedding(CHORD_ATTR_SIZE, self.d_model)
+
+ self.total_vf_dim = total_vf_dim
+ self.Linear_vis = nn.Linear(self.total_vf_dim, self.d_model)
+ self.Linear_chord = nn.Linear(self.d_model+1, self.d_model)
+
+ # Positional encoding
+ self.positional_encoding = PositionalEncoding(self.d_model, self.dropout, self.max_seq_chord)
+ self.positional_encoding_video = PositionalEncoding(self.d_model, self.dropout, self.max_seq_video)
+
+ # Add condition (minor or major)
+ self.condition_linear = nn.Linear(1, self.d_model)
+
+ # Base transformer
+ if(not self.rpr):
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=self.nlayers, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff
+ )
+ # RPR Transformer
+ else:
+ decoder_norm = LayerNorm(self.d_model)
+ decoder_layer = TransformerDecoderLayerRPR(self.d_model, self.nhead, self.d_ff, self.dropout, er_len=self.max_seq_chord)
+ decoder = TransformerDecoderRPR(decoder_layer, self.nlayers, decoder_norm)
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=self.nlayers, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff, custom_decoder=decoder
+ )
+
+ self.Wout = nn.Linear(self.d_model, CHORD_SIZE)
+ self.Wout_root = nn.Linear(self.d_model, CHORD_ROOT_SIZE)
+ self.Wout_attr = nn.Linear(self.d_model, CHORD_ATTR_SIZE)
+ self.softmax = nn.Softmax(dim=-1)
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ def forward(self, x, x_root, x_attr, feature_semantic_list, feature_key, feature_scene_offset, feature_motion, feature_emotion, mask=True):
+ if(mask is True):
+ mask = self.transformer.generate_square_subsequent_mask(x.shape[1]).to(self.device)
+ else:
+ mask = None
+
+ x_root = self.embedding_root(x_root)
+ x_attr = self.embedding_attr(x_attr)
+ x = x_root + x_attr
+
+ feature_key_padded = torch.full((x.shape[0], x.shape[1], 1), feature_key.item())
+ feature_key_padded = feature_key_padded.to(self.device)
+ x = torch.cat([x, feature_key_padded], dim=-1)
+
+ xf = self.Linear_chord(x)
+
+ ### Video (SemanticList + SceneOffset + Motion + Emotion) (ENCODER) ###
+ vf_concat = feature_semantic_list[0].float()
+
+ for i in range(1, len(feature_semantic_list)):
+ vf_concat = torch.cat( (vf_concat, feature_semantic_list[i].float()), dim=2)
+
+ vf_concat = torch.cat([vf_concat, feature_scene_offset.unsqueeze(-1).float()], dim=-1) # append scene-offset scalar
+ vf_concat = torch.cat([vf_concat, feature_motion.unsqueeze(-1).float()], dim=-1) # append motion scalar
+ vf_concat = torch.cat([vf_concat, feature_emotion.float()], dim=-1) # -> (batch, max_seq_video, total_vf_dim)
+ vf = self.Linear_vis(vf_concat)
+
+ ### POSITIONAL ENCODING ###
+
+ xf = xf.permute(1,0,2) # -> (seq_len, batch_size, d_model)
+ vf = vf.permute(1,0,2) # -> (max_seq_video, batch_size, d_model)
+
+ xf = self.positional_encoding(xf)
+ vf = self.positional_encoding_video(vf)
+
+ ### TRANSFORMER ###
+ x_out = self.transformer(src=vf, tgt=xf, tgt_mask=mask)
+ x_out = x_out.permute(1,0,2)
+
+ if IS_SEPERATED:
+ y_root = self.Wout_root(x_out)
+ y_attr = self.Wout_attr(x_out)
+ del mask
+ return y_root, y_attr
+ else:
+ y = self.Wout(x_out)
+ del mask
+ return y
+
+ def generate(self, feature_semantic_list = [], feature_key=None, feature_scene_offset=None, feature_motion=None, feature_emotion=None,
+ primer=None, primer_root=None, primer_attr=None, target_seq_length=300, beam=0,
+ beam_chance=1.0, max_conseq_N = 0, max_conseq_chord = 2):
+
+ assert (not self.training), "Cannot generate while in training mode"
+ print("Generating sequence of max length:", target_seq_length)
+
+ with open('dataset/vevo_meta/chord_inv.json') as json_file:
+ chordInvDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_root.json') as json_file:
+ chordRootDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_attr.json') as json_file:
+ chordAttrDic = json.load(json_file)
+
+ gen_seq = torch.full((1,target_seq_length), CHORD_PAD, dtype=TORCH_LABEL_TYPE, device=self.device)
+ gen_seq_root = torch.full((1,target_seq_length), CHORD_ROOT_PAD, dtype=TORCH_LABEL_TYPE, device=self.device)
+ gen_seq_attr = torch.full((1,target_seq_length), CHORD_ATTR_PAD, dtype=TORCH_LABEL_TYPE, device=self.device)
+
+ num_primer = len(primer)
+ gen_seq[..., :num_primer] = primer.type(TORCH_LABEL_TYPE).to(self.device)
+ gen_seq_root[..., :num_primer] = primer_root.type(TORCH_LABEL_TYPE).to(self.device)
+ gen_seq_attr[..., :num_primer] = primer_attr.type(TORCH_LABEL_TYPE).to(self.device)
+
+ cur_i = num_primer
+ while(cur_i < target_seq_length):
+ y = self.softmax( self.forward( gen_seq[..., :cur_i], gen_seq_root[..., :cur_i], gen_seq_attr[..., :cur_i],
+ feature_semantic_list, feature_key, feature_scene_offset, feature_motion, feature_emotion) )[..., :CHORD_END]
+
+ token_probs = y[:, cur_i-1, :]
+ if(beam == 0):
+ beam_ran = 2.0
+ else:
+ beam_ran = random.uniform(0,1)
+ if(beam_ran <= beam_chance):
+ token_probs = token_probs.flatten()
+ top_res, top_i = torch.topk(token_probs, beam)
+ beam_rows = top_i // CHORD_SIZE
+ beam_cols = top_i % CHORD_SIZE
+ gen_seq = gen_seq[beam_rows, :]
+ gen_seq[..., cur_i] = beam_cols
+ else:
+ # token_probs.shape : [1, 157]
+ # 0: N, 1: C, ... , 156: B:maj7
+ # 157 chordEnd 158 padding
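+ # Constrain sampling: when max_conseq_N == 0 the N (no-chord) token is
+ # disallowed, and a chord that has already repeated max_conseq_chord
+ # times in a row has its probability zeroed before sampling.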
+ if max_conseq_N == 0:
+ token_probs[0][0] = 0.0
+ isMaxChord = True
+ if cur_i >= max_conseq_chord :
+ preChord = gen_seq[0][cur_i-1].item()
+ for k in range (1, max_conseq_chord):
+ if preChord != gen_seq[0][cur_i-1-k].item():
+ isMaxChord = False
+ else:
+ isMaxChord = False
+
+ if isMaxChord:
+ preChord = gen_seq[0][cur_i-1].item()
+ token_probs[0][preChord] = 0.0
+
+ distrib = torch.distributions.categorical.Categorical(probs=token_probs)
+ next_token = distrib.sample()
+ gen_seq[:, cur_i] = next_token
+ gen_chord = chordInvDic[ str( next_token.item() ) ]
+
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = 1
+ chordRootID = torch.tensor([chordRootID]).to(self.device)
+ chordAttrID = torch.tensor([chordAttrID]).to(self.device)
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+ elif len(chord_arr) == 2:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ chordRootID = torch.tensor([chordRootID]).to(self.device)
+ chordAttrID = torch.tensor([chordAttrID]).to(self.device)
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+
+ # Let the transformer decide to end if it wants to
+ if(next_token == CHORD_END):
+ print("Model called end of sequence at:", cur_i, "/", target_seq_length)
+ break
+ cur_i += 1
+ if(cur_i % 50 == 0):
+ print(cur_i, "/", target_seq_length)
+ return gen_seq[:, :cur_i]
+
diff --git a/model/video_regression.py b/model/video_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aaf4968ed76b5170afd3a2ba06bcfd76a129dd0
--- /dev/null
+++ b/model/video_regression.py
@@ -0,0 +1,64 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.normalization import LayerNorm
+import random
+import numpy as np
+from utilities.constants import *
+from utilities.device import get_device
+from datetime import datetime
+
+import torch.nn.functional as F
+
+class VideoRegression(nn.Module):
+ def __init__(self, n_layers=2, d_model=64, dropout=0.1, max_sequence_video=300, total_vf_dim = 0, regModel="bilstm"):
+ super(VideoRegression, self).__init__()
+ self.nlayers = n_layers
+ self.d_model = d_model
+ self.dropout = dropout
+ self.max_seq_video = max_sequence_video
+ self.total_vf_dim = total_vf_dim
+ self.regModel = regModel
+
+ self.bilstm = nn.LSTM(self.total_vf_dim, self.d_model, self.nlayers, bidirectional=True)
+ self.bigru = nn.GRU(self.total_vf_dim, self.d_model, self.nlayers, bidirectional=True)
+ self.bifc = nn.Linear(self.d_model * 2, 2)
+
+ self.lstm = nn.LSTM(self.total_vf_dim, self.d_model, self.nlayers)
+ self.gru = nn.GRU(self.total_vf_dim, self.d_model, self.nlayers)
+ self.fc = nn.Linear(self.d_model, 2)
+
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+
+
+ def forward(self, feature_semantic_list, feature_scene_offset, feature_motion, feature_emotion):
+ ### Video (SemanticList + SceneOffset + Motion + Emotion) (ENCODER) ###
+ vf_concat = feature_semantic_list[0].float()
+ for i in range(1, len(feature_semantic_list)):
+ vf_concat = torch.cat( (vf_concat, feature_semantic_list[i].float()), dim=2)
+
+ vf_concat = torch.cat([vf_concat, feature_scene_offset.unsqueeze(-1).float()], dim=-1)
+ vf_concat = torch.cat([vf_concat, feature_motion.unsqueeze(-1).float()], dim=-1)
+ vf_concat = torch.cat([vf_concat, feature_emotion.float()], dim=-1)
+
+ vf_concat = vf_concat.permute(1,0,2)
+ vf_concat = F.dropout(vf_concat, p=self.dropout, training=self.training)
+
+ if self.regModel == "bilstm":
+ out, _ = self.bilstm(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.bifc(out)
+ elif self.regModel == "bigru":
+ out, _ = self.bigru(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.bifc(out)
+ elif self.regModel == "lstm":
+ out, _ = self.lstm(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.fc(out)
+ elif self.regModel == "gru":
+ out, _ = self.gru(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.fc(out)
+ return out
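+
+# Shape sketch (assumptions noted, not part of the training code): each entry of
+# feature_semantic_list is (batch, max_seq_video, D_i); after concatenation the
+# last dim equals total_vf_dim, and the head maps every timestep to 2 targets
+# (presumably note density and loudness, given the loudness/velocity mapping
+# constants defined elsewhere in this repo).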
+
diff --git a/packages.txt b/packages.txt
new file mode 100644
index 0000000000000000000000000000000000000000..61b2c6ce9ffb278c58aadd9cfe68cec1ae3ba1aa
--- /dev/null
+++ b/packages.txt
@@ -0,0 +1,2 @@
+ffmpeg
+fluidsynth
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ab0c3868c755f2078a1e8e382b979cd54c792751
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,24 @@
+torchvision==0.9.0
+torch==1.8.0
+clip @ git+https://github.com/openai/CLIP.git
+Cython==3.0.5
+numpy==1.19.5
+coloredlogs==15.0.1
+ffmpeg_python==0.2.0
+ftfy==6.1.1
+matplotlib==3.5.3
+midi2audio==0.1.1
+MIDIUtil==1.2.1
+moviepy==1.0.3
+music21==7.3.3
+opencv_python==4.7.0.72
+pandas==1.3.5
+Pillow==8.4.0
+pretty_midi==0.2.9
+pydub==0.25.1
+regex==2022.10.31
+scenedetect==0.6.1
+scikit_learn==1.0.2
+scipy==1.7.3
+gradio==4.7.1
+pyfluidsynth
diff --git a/saved_models/AMT/README.md b/saved_models/AMT/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..84647b908c040cd170233e1b0c1289fd238dfa49
--- /dev/null
+++ b/saved_models/AMT/README.md
@@ -0,0 +1 @@
+Put the pretrained model weight pickle files in this directory.
diff --git a/saved_models/AMT/best_loss_weights.pickle b/saved_models/AMT/best_loss_weights.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..25d01b2be9250cc3af42111c82e7d9b2234203f6
--- /dev/null
+++ b/saved_models/AMT/best_loss_weights.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:859f0fac92c6d4ac84446983cd138ca8d625a41e1854edbd86ea29a14f0aad28
+size 131375779
diff --git a/saved_models/AMT/best_rmse_weights.pickle b/saved_models/AMT/best_rmse_weights.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..f6f6e3936b81fff9856628aa8d79b5c8292ca264
--- /dev/null
+++ b/saved_models/AMT/best_rmse_weights.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3680851df4f8bb7902539bc10b3025eaa7162410826c164b4aec4d44a8c19818
+size 5463439
diff --git a/third_party/midi_processor/__pycache__/processor.cpython-37.pyc b/third_party/midi_processor/__pycache__/processor.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..772264dd0765aa59c39565285b5ba9da3d17753c
Binary files /dev/null and b/third_party/midi_processor/__pycache__/processor.cpython-37.pyc differ
diff --git a/third_party/midi_processor/__pycache__/processor.cpython-38.pyc b/third_party/midi_processor/__pycache__/processor.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ae0acfff0e1fbcac7a233fbfef4811ef831b9b
Binary files /dev/null and b/third_party/midi_processor/__pycache__/processor.cpython-38.pyc differ
diff --git a/third_party/midi_processor/processor.py b/third_party/midi_processor/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6531853b3888c0b50e20fc72ca207dfba5cf49a
--- /dev/null
+++ b/third_party/midi_processor/processor.py
@@ -0,0 +1,261 @@
+import pretty_midi
+
+RANGE_NOTE_ON = 128
+RANGE_NOTE_OFF = 128
+RANGE_VEL = 32
+RANGE_TIME_SHIFT = 100
+
+START_IDX = {
+ 'note_on': 0,
+ 'note_off': RANGE_NOTE_ON,
+ 'time_shift': RANGE_NOTE_ON + RANGE_NOTE_OFF,
+ 'velocity': RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_TIME_SHIFT
+}
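+
+# Event vocabulary layout (388 tokens total):
+# [0, 128) note_on pitch
+# [128, 256) note_off pitch
+# [256, 356) time_shift in 10 ms steps ((value+1)/100 s, up to 1 s)
+# [356, 388) velocity, quantized into 32 bins (MIDI velocity // 4)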
+
+class SustainAdapter:
+ def __init__(self, time, type):
+ self.start = time
+ self.type = type
+
+
+class SustainDownManager:
+ def __init__(self, start, end):
+ self.start = start
+ self.end = end
+ self.managed_notes = []
+ self._note_dict = {} # key: pitch, value: note.start
+
+ def add_managed_note(self, note: pretty_midi.Note):
+ self.managed_notes.append(note)
+
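+ # Iterating in reverse, each note's end snaps to the next onset of the same
+ # pitch (a re-strike while the pedal is down), or else extends to at least
+ # the pedal release time.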
+ def transposition_notes(self):
+ for note in reversed(self.managed_notes):
+ try:
+ note.end = self._note_dict[note.pitch]
+ except KeyError:
+ note.end = max(self.end, note.end)
+ self._note_dict[note.pitch] = note.start
+
+
+# Splits each note into separate note_on and note_off events
+class SplitNote:
+ def __init__(self, type, time, value, velocity):
+ ## type: note_on, note_off
+ self.type = type
+ self.time = time
+ self.velocity = velocity
+ self.value = value
+
+ def __repr__(self):
+ return '<[SNote] time: {} type: {}, value: {}, velocity: {}>'\
+ .format(self.time, self.type, self.value, self.velocity)
+
+
+class Event:
+ def __init__(self, event_type, value):
+ self.type = event_type
+ self.value = value
+
+ def __repr__(self):
+ return '<Event type: {}, value: {}>'.format(self.type, self.value)
+
+ def to_int(self):
+ return START_IDX[self.type] + self.value
+
+ @staticmethod
+ def from_int(int_value):
+ info = Event._type_check(int_value)
+ return Event(info['type'], info['value'])
+
+ @staticmethod
+ def _type_check(int_value):
+ range_note_on = range(0, RANGE_NOTE_ON)
+ range_note_off = range(RANGE_NOTE_ON, RANGE_NOTE_ON+RANGE_NOTE_OFF)
+ range_time_shift = range(RANGE_NOTE_ON+RANGE_NOTE_OFF,RANGE_NOTE_ON+RANGE_NOTE_OFF+RANGE_TIME_SHIFT)
+
+ valid_value = int_value
+
+ if int_value in range_note_on:
+ return {'type': 'note_on', 'value': valid_value}
+ elif int_value in range_note_off:
+ valid_value -= RANGE_NOTE_ON
+ return {'type': 'note_off', 'value': valid_value}
+ elif int_value in range_time_shift:
+ valid_value -= (RANGE_NOTE_ON + RANGE_NOTE_OFF)
+ return {'type': 'time_shift', 'value': valid_value}
+ else:
+ valid_value -= (RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_TIME_SHIFT)
+ return {'type': 'velocity', 'value': valid_value}
+
+
+def _divide_note(notes):
+ result_array = []
+ notes.sort(key=lambda x: x.start)
+
+ for note in notes:
+ on = SplitNote('note_on', note.start, note.pitch, note.velocity)
+ off = SplitNote('note_off', note.end, note.pitch, None)
+ result_array += [on, off]
+ return result_array
+
+
+def _merge_note(snote_sequence):
+ note_on_dict = {}
+ result_array = []
+
+ for snote in snote_sequence:
+ # print(note_on_dict)
+ if snote.type == 'note_on':
+ note_on_dict[snote.value] = snote
+ elif snote.type == 'note_off':
+ try:
+ on = note_on_dict[snote.value]
+ off = snote
+ if off.time - on.time == 0:
+ continue
+ result = pretty_midi.Note(on.velocity, snote.value, on.time, off.time)
+ result_array.append(result)
+ except KeyError:
+ print('info: removed note_off with no matching note_on (pitch: {})'.format(snote.value))
+ return result_array
+
+
+def _snote2events(snote: SplitNote, prev_vel: int):
+ result = []
+ if snote.velocity is not None:
+ modified_velocity = snote.velocity // 4
+ if prev_vel != modified_velocity:
+ result.append(Event(event_type='velocity', value=modified_velocity))
+ result.append(Event(event_type=snote.type, value=snote.value))
+ return result
+
+
+def _event_seq2snote_seq(event_sequence):
+ timeline = 0
+ velocity = 0
+ snote_seq = []
+
+ for event in event_sequence:
+ if event.type == 'time_shift':
+ timeline += ((event.value+1) / 100)
+ elif event.type == 'velocity': # elif: a time_shift event must not also emit a SplitNote
+ velocity = event.value * 4
+ else:
+ snote = SplitNote(event.type, timeline, event.value, velocity)
+ snote_seq.append(snote)
+ return snote_seq
+
+
+def _make_time_shift_events(prev_time, post_time):
+ time_interval = int(round((post_time - prev_time) * 100))
+ results = []
+ while time_interval >= RANGE_TIME_SHIFT:
+ results.append(Event(event_type='time_shift', value=RANGE_TIME_SHIFT-1))
+ time_interval -= RANGE_TIME_SHIFT
+ if time_interval == 0:
+ return results
+ else:
+ return results + [Event(event_type='time_shift', value=time_interval-1)]
+
+
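+# Pair each sustain-pedal press (CC64 value >= 64) with the following release
+# (value < 64) into a SustainDownManager covering that [start, end) window.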
+def _control_preprocess(ctrl_changes):
+ sustains = []
+
+ manager = None
+ for ctrl in ctrl_changes:
+ if ctrl.value >= 64 and manager is None:
+ # sustain down
+ manager = SustainDownManager(start=ctrl.time, end=None)
+ elif ctrl.value < 64 and manager is not None:
+ # sustain up
+ manager.end = ctrl.time
+ sustains.append(manager)
+ manager = None
+ elif ctrl.value < 64 and len(sustains) > 0:
+ sustains[-1].end = ctrl.time
+ return sustains
+
+
+def _note_preprocess(sustains, notes):
+ note_stream = []
+
+ if sustains: # if the midi file has sustain controls
+ for sustain in sustains:
+ for note_idx, note in enumerate(notes):
+ if note.start < sustain.start:
+ note_stream.append(note)
+ elif note.start > sustain.end:
+ notes = notes[note_idx:]
+ sustain.transposition_notes()
+ break
+ else:
+ sustain.add_managed_note(note)
+
+ for sustain in sustains:
+ note_stream += sustain.managed_notes
+
+ else: # else, just push everything into note stream
+ for note_idx, note in enumerate(notes):
+ note_stream.append(note)
+
+ note_stream.sort(key= lambda x: x.start)
+ return note_stream
+
+
+def encode_midi(file_path):
+ events = []
+ notes = []
+ mid = pretty_midi.PrettyMIDI(midi_file=file_path)
+
+ for inst in mid.instruments:
+ inst_notes = inst.notes
+ # CC number 64 is the sustain pedal. For the full list of control-change
+ # numbers, see https://www.midi.org/specifications-old/item/table-3-control-change-messages-data-bytes-2
+ ctrls = _control_preprocess([ctrl for ctrl in inst.control_changes if ctrl.number == 64])
+ notes += _note_preprocess(ctrls, inst_notes)
+
+ dnotes = _divide_note(notes)
+ # print(dnotes)
+ dnotes.sort(key=lambda x: x.time)
+ # print('sorted:')
+ # print(dnotes)
+ cur_time = 0
+ cur_vel = 0
+ for snote in dnotes:
+ events += _make_time_shift_events(prev_time=cur_time, post_time=snote.time)
+ events += _snote2events(snote=snote, prev_vel=cur_vel)
+ cur_time = snote.time
+ cur_vel = snote.velocity
+
+ return [e.to_int() for e in events]
+
+def decode_midi(idx_array, file_path=None):
+ event_sequence = [Event.from_int(idx) for idx in idx_array]
+ # print(event_sequence)
+ snote_seq = _event_seq2snote_seq(event_sequence)
+ note_seq = _merge_note(snote_seq)
+ note_seq.sort(key=lambda x:x.start)
+
+ mid = pretty_midi.PrettyMIDI()
+ # if you want to change the instrument, see https://www.midi.org/specifications/item/gm-level-1-sound-set
+ instrument = pretty_midi.Instrument(1, False, "Developed By Jaeyong Kang")
+ instrument.notes = note_seq
+
+ mid.instruments.append(instrument)
+ if file_path is not None:
+ mid.write(file_path)
+ return mid
+
+# if __name__ == '__main__':
+# encoded = encode_midi('bin/ADIG04.mid')
+# print(encoded)
+# decided = decode_midi(encoded,file_path='bin/test.mid')
+
+# ins = pretty_midi.PrettyMIDI('bin/ADIG04.mid')
+# print(ins)
+# print(ins.instruments[0])
+# for i in ins.instruments:
+# print(i.control_changes)
+# print(i.notes)
+
diff --git a/utilities/__init__.py b/utilities/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/utilities/__pycache__/__init__.cpython-37.pyc b/utilities/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1c917f3725cc057a73fb1c8c56035d3f72f3df3
Binary files /dev/null and b/utilities/__pycache__/__init__.cpython-37.pyc differ
diff --git a/utilities/__pycache__/__init__.cpython-38.pyc b/utilities/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..413f568d22a4d6720f04f2fd21bac47445825776
Binary files /dev/null and b/utilities/__pycache__/__init__.cpython-38.pyc differ
diff --git a/utilities/__pycache__/argument_funcs.cpython-37.pyc b/utilities/__pycache__/argument_funcs.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3d8762b8a135a44f00bb9317ba234f5528c3154
Binary files /dev/null and b/utilities/__pycache__/argument_funcs.cpython-37.pyc differ
diff --git a/utilities/__pycache__/chord_to_midi.cpython-37.pyc b/utilities/__pycache__/chord_to_midi.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29aea4e5b863c4c6cb365b6ac95a75c2ed930d2c
Binary files /dev/null and b/utilities/__pycache__/chord_to_midi.cpython-37.pyc differ
diff --git a/utilities/__pycache__/chord_to_midi.cpython-38.pyc b/utilities/__pycache__/chord_to_midi.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f66382aa7a37bf413c1f523e77cd0d5b17ad32a
Binary files /dev/null and b/utilities/__pycache__/chord_to_midi.cpython-38.pyc differ
diff --git a/utilities/__pycache__/constants.cpython-37.pyc b/utilities/__pycache__/constants.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..152e8575802dc8868fa801e9fba3364833e38729
Binary files /dev/null and b/utilities/__pycache__/constants.cpython-37.pyc differ
diff --git a/utilities/__pycache__/constants.cpython-38.pyc b/utilities/__pycache__/constants.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc612d3513fae337d93d7d0dbc3234e792e663ed
Binary files /dev/null and b/utilities/__pycache__/constants.cpython-38.pyc differ
diff --git a/utilities/__pycache__/device.cpython-37.pyc b/utilities/__pycache__/device.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b3105df0fb1ed22b15ba7e1dc8b77112dbeba3c
Binary files /dev/null and b/utilities/__pycache__/device.cpython-37.pyc differ
diff --git a/utilities/__pycache__/device.cpython-38.pyc b/utilities/__pycache__/device.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cc932b1e2d496ed28d13778dd032e4ec6d5c5f8
Binary files /dev/null and b/utilities/__pycache__/device.cpython-38.pyc differ
diff --git a/utilities/argument_funcs.py b/utilities/argument_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..8aea43e4d23f0bc550cacd5a364a119af22d7999
--- /dev/null
+++ b/utilities/argument_funcs.py
@@ -0,0 +1,275 @@
+import argparse
+from .constants import *
+
+version = VERSION
+split_ver = SPLIT_VER
+split_path = "split_" + split_ver
+
+def parse_train_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("-dataset_dir", type=str, default="./dataset/", help="Folder of VEVO dataset")
+
+ parser.add_argument("-input_dir_music", type=str, default="./dataset/vevo_chord/" + MUSIC_TYPE, help="Folder of video CNN feature files")
+ parser.add_argument("-input_dir_video", type=str, default="./dataset/vevo_vis", help="Folder of video CNN feature files")
+
+ parser.add_argument("-output_dir", type=str, default="./saved_models", help="Folder to save model weights. Saves one every epoch")
+
+ parser.add_argument("-weight_modulus", type=int, default=1, help="How often to save epoch weights (ex: value of 10 means save every 10 epochs)")
+ parser.add_argument("-print_modulus", type=int, default=1, help="How often to print train results for a batch (batch loss, learn rate, etc.)")
+ parser.add_argument("-n_workers", type=int, default=1, help="Number of threads for the dataloader")
+ parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
+ parser.add_argument("--no_tensorboard", action="store_true", help="Turns off tensorboard result reporting")
+ parser.add_argument("-continue_weights", type=str, default=None, help="Model weights to continue training based on")
+ parser.add_argument("-continue_epoch", type=int, default=None, help="Epoch the continue_weights model was at")
+ parser.add_argument("-lr", type=float, default=None, help="Constant learn rate. Leave as None for a custom scheduler.")
+ parser.add_argument("-ce_smoothing", type=float, default=None, help="Smoothing parameter for smoothed cross entropy loss (defaults to no smoothing)")
+ parser.add_argument("-batch_size", type=int, default=1, help="Batch size to use")
+ parser.add_argument("-epochs", type=int, default=5, help="Number of epochs to use")
+
+ parser.add_argument("-max_sequence_midi", type=int, default=2048, help="Maximum midi sequence to consider")
+ parser.add_argument("-max_sequence_video", type=int, default=300, help="Maximum video sequence to consider")
+ parser.add_argument("-max_sequence_chord", type=int, default=300, help="Maximum video sequence to consider")
+
+ parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
+ parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
+ parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
+ parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
+ parser.add_argument("-dropout", type=float, default=0.1, help="Dropout rate")
+
+ parser.add_argument("-is_video", type=bool, default=IS_VIDEO, help="MusicTransformer or VideoMusicTransformer")
+
+ if IS_VIDEO:
+ parser.add_argument("-vis_models", type=str, default=VIS_MODELS_SORTED, help="...")
+ else:
+ parser.add_argument("-vis_models", type=str, default="", help="...")
+
+ parser.add_argument("-emo_model", type=str, default="6c_l14p", help="...")
+ parser.add_argument("-rpr", type=bool, default=RPR, help="...")
+ return parser.parse_args()
+
+def print_train_args(args):
+ print(SEPERATOR)
+
+ print("dataset_dir:", args.dataset_dir )
+
+ print("input_dir_music:", args.input_dir_music)
+ print("input_dir_video:", args.input_dir_video)
+
+ print("output_dir:", args.output_dir)
+
+ print("weight_modulus:", args.weight_modulus)
+ print("print_modulus:", args.print_modulus)
+ print("")
+ print("n_workers:", args.n_workers)
+ print("force_cpu:", args.force_cpu)
+ print("tensorboard:", not args.no_tensorboard)
+ print("")
+ print("continue_weights:", args.continue_weights)
+ print("continue_epoch:", args.continue_epoch)
+ print("")
+ print("lr:", args.lr)
+ print("ce_smoothing:", args.ce_smoothing)
+ print("batch_size:", args.batch_size)
+ print("epochs:", args.epochs)
+ print("")
+ print("rpr:", args.rpr)
+
+ print("max_sequence_midi:", args.max_sequence_midi)
+ print("max_sequence_video:", args.max_sequence_video)
+ print("max_sequence_chord:", args.max_sequence_chord)
+
+ print("n_layers:", args.n_layers)
+ print("num_heads:", args.num_heads)
+ print("d_model:", args.d_model)
+ print("")
+ print("dim_feedforward:", args.dim_feedforward)
+ print("dropout:", args.dropout)
+ print("is_video:", args.is_video)
+
+ print(SEPERATOR)
+ print("")
+
+def parse_eval_args():
+ if IS_VIDEO:
+ modelpath = "./saved_models/AMT/best_acc_weights.pickle"
+ # modelpath = "./saved_models/"+version+ "/"+VIS_MODELS_PATH+"/results/best_loss_weights.pickle"
+ else:
+ modelpath = "./saved_models/"+version+ "/no_video/results/best_acc_weights.pickle"
+
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("-dataset_dir", type=str, default="./dataset/", help="Folder of VEVO dataset")
+
+ parser.add_argument("-input_dir_music", type=str, default="./dataset/vevo_chord/" + MUSIC_TYPE, help="Folder of video CNN feature files")
+ parser.add_argument("-input_dir_video", type=str, default="./dataset/vevo_vis", help="Folder of video CNN feature files")
+
+ parser.add_argument("-model_weights", type=str, default= modelpath, help="Pickled model weights file saved with torch.save and model.state_dict()")
+
+ parser.add_argument("-n_workers", type=int, default=1, help="Number of threads for the dataloader")
+ parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
+ parser.add_argument("-batch_size", type=int, default=1, help="Batch size to use")
+
+ parser.add_argument("-max_sequence_midi", type=int, default=2048, help="Maximum midi sequence to consider")
+ parser.add_argument("-max_sequence_video", type=int, default=300, help="Maximum video sequence to consider")
+ parser.add_argument("-max_sequence_chord", type=int, default=300, help="Maximum video sequence to consider")
+
+ parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
+ parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
+ parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
+ parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
+
+ parser.add_argument("-is_video", type=bool, default=IS_VIDEO, help="MusicTransformer or VideoMusicTransformer")
+
+ if IS_VIDEO:
+ parser.add_argument("-vis_models", type=str, default=VIS_MODELS_SORTED, help="...")
+ else:
+ parser.add_argument("-vis_models", type=str, default="", help="...")
+
+ parser.add_argument("-emo_model", type=str, default="6c_l14p", help="...")
+ parser.add_argument("-rpr", type=bool, default=RPR, help="...")
+ return parser.parse_args()
+
+def print_eval_args(args):
+ print(SEPERATOR)
+ print("input_dir_music:", args.input_dir_music)
+ print("input_dir_video:", args.input_dir_video)
+
+ print("model_weights:", args.model_weights)
+ print("n_workers:", args.n_workers)
+ print("force_cpu:", args.force_cpu)
+ print("")
+ print("batch_size:", args.batch_size)
+ print("")
+ print("rpr:", args.rpr)
+
+ print("max_sequence_midi:", args.max_sequence_midi)
+ print("max_sequence_video:", args.max_sequence_video)
+ print("max_sequence_chord:", args.max_sequence_chord)
+
+ print("n_layers:", args.n_layers)
+ print("num_heads:", args.num_heads)
+ print("d_model:", args.d_model)
+ print("")
+ print("dim_feedforward:", args.dim_feedforward)
+ print(SEPERATOR)
+ print("")
+
+# parse_generate_args
+def parse_generate_args():
+ parser = argparse.ArgumentParser()
+ outputpath = "./output_vevo/"+version
+ if IS_VIDEO:
+ modelpath = "./saved_models/AMT/best_loss_weights.pickle"
+ modelpathReg = "./saved_models/AMT/best_rmse_weights.pickle"
+ # modelpath = "./saved_models/"+version+ "/"+VIS_MODELS_PATH+"/results/best_acc_weights.pickle"
+ # modelpathReg = "./saved_models/"+version+ "/"+VIS_MODELS_PATH+"/results_regression_bigru/best_rmse_weights.pickle"
+ else:
+ modelpath = "./saved_models/"+version+ "/no_video/results/best_loss_weights.pickle"
+ modelpathReg = None
+
+ parser.add_argument("-dataset_dir", type=str, default="./dataset/", help="Folder of VEVO dataset")
+
+ parser.add_argument("-input_dir_music", type=str, default="./dataset/vevo_chord/" + MUSIC_TYPE, help="Folder of video CNN feature files")
+ parser.add_argument("-input_dir_video", type=str, default="./dataset/vevo_vis", help="Folder of video CNN feature files")
+
+ parser.add_argument("-output_dir", type=str, default= outputpath, help="Folder to write generated midi to")
+
+ parser.add_argument("-primer_file", type=str, default=None, help="File path or integer index to the evaluation dataset. Default is to select a random index.")
+ parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
+
+ parser.add_argument("-target_seq_length_midi", type=int, default=1024, help="Target length you'd like the midi to be")
+ parser.add_argument("-target_seq_length_chord", type=int, default=300, help="Target length you'd like the midi to be")
+
+ parser.add_argument("-num_prime_midi", type=int, default=256, help="Amount of messages to prime the generator with")
+ parser.add_argument("-num_prime_chord", type=int, default=30, help="Amount of messages to prime the generator with")
+ parser.add_argument("-model_weights", type=str, default=modelpath, help="Pickled model weights file saved with torch.save and model.state_dict()")
+ parser.add_argument("-modelReg_weights", type=str, default=modelpathReg, help="Pickled model weights file saved with torch.save and model.state_dict()")
+
+ parser.add_argument("-beam", type=int, default=0, help="Beam search k. 0 for random probability sample and 1 for greedy")
+
+ parser.add_argument("-max_sequence_midi", type=int, default=2048, help="Maximum midi sequence to consider")
+ parser.add_argument("-max_sequence_video", type=int, default=300, help="Maximum video sequence to consider")
+ parser.add_argument("-max_sequence_chord", type=int, default=300, help="Maximum chord sequence to consider")
+
+ parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
+ parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
+ parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
+ parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
+
+ parser.add_argument("-is_video", type=bool, default=IS_VIDEO, help="MusicTransformer or VideoMusicTransformer")
+
+ if IS_VIDEO:
+ parser.add_argument("-vis_models", type=str, default=VIS_MODELS_SORTED, help="...")
+ else:
+ parser.add_argument("-vis_models", type=str, default="", help="...")
+
+ parser.add_argument("-emo_model", type=str, default="6c_l14p", help="...")
+ parser.add_argument("-rpr", type=bool, default=RPR, help="...")
+ parser.add_argument("-test_id", type=str, default=None, help="Dimension of the feedforward layer")
+
+ return parser.parse_args()
+
+def print_generate_args(args):
+
+ print(SEPERATOR)
+ print("input_dir_music:", args.input_dir_music)
+ print("input_dir_video:", args.input_dir_video)
+
+ print("output_dir:", args.output_dir)
+ print("primer_file:", args.primer_file)
+ print("force_cpu:", args.force_cpu)
+ print("")
+
+ print("target_seq_length_midi:", args.target_seq_length_midi)
+ print("target_seq_length_chord:", args.target_seq_length_chord)
+
+ print("num_prime_midi:", args.num_prime_midi)
+ print("num_prime_chord:", args.num_prime_chord)
+
+ print("model_weights:", args.model_weights)
+ print("beam:", args.beam)
+ print("")
+ print("rpr:", args.rpr)
+
+ print("max_sequence_midi:", args.max_sequence_midi)
+ print("max_sequence_video:", args.max_sequence_video)
+ print("max_sequence_chord:", args.max_sequence_chord)
+
+ print("n_layers:", args.n_layers)
+ print("num_heads:", args.num_heads)
+ print("d_model:", args.d_model)
+ print("")
+ print("dim_feedforward:", args.dim_feedforward)
+ print("")
+ print("test_id:", args.test_id)
+
+ print(SEPERATOR)
+ print("")
+
+# write_model_params
+def write_model_params(args, output_file):
+ o_stream = open(output_file, "w")
+
+ o_stream.write("rpr: " + str(args.rpr) + "\n")
+ o_stream.write("lr: " + str(args.lr) + "\n")
+ o_stream.write("ce_smoothing: " + str(args.ce_smoothing) + "\n")
+ o_stream.write("batch_size: " + str(args.batch_size) + "\n")
+
+ o_stream.write("max_sequence_midi: " + str(args.max_sequence_midi) + "\n")
+ o_stream.write("max_sequence_video: " + str(args.max_sequence_video) + "\n")
+ o_stream.write("max_sequence_chord: " + str(args.max_sequence_chord) + "\n")
+
+ o_stream.write("n_layers: " + str(args.n_layers) + "\n")
+ o_stream.write("num_heads: " + str(args.num_heads) + "\n")
+ o_stream.write("d_model: " + str(args.d_model) + "\n")
+ o_stream.write("dim_feedforward: " + str(args.dim_feedforward) + "\n")
+ o_stream.write("dropout: " + str(args.dropout) + "\n")
+
+ o_stream.write("is_video: " + str(args.is_video) + "\n")
+ o_stream.write("vis_models: " + str(args.vis_models) + "\n")
+ o_stream.write("input_dir_music: " + str(args.input_dir_music) + "\n")
+ o_stream.write("input_dir_video: " + str(args.input_dir_video) + "\n")
+
+ o_stream.close()
diff --git a/utilities/chord_to_midi.py b/utilities/chord_to_midi.py
new file mode 100644
index 0000000000000000000000000000000000000000..393a43e039905a4d39f71d10371b796280713c9d
--- /dev/null
+++ b/utilities/chord_to_midi.py
@@ -0,0 +1,316 @@
+# ezchord - convert complex chord names to midi notes
+
+import sys
+import math
+import argparse
+from enum import Enum, auto
+from midiutil import MIDIFile
+
+class Mode(Enum):
+ DIM = auto()
+ MIN = auto()
+ MAJ = auto()
+ DOM = auto()
+ AUG = auto()
+ SUS2 = auto()
+ SUS = auto()
+ FIVE = auto()
+
+TEXT_TO_MODE = {
+ "maj": Mode.MAJ,
+ "dim": Mode.DIM,
+ "o": Mode.DIM,
+ "min": Mode.MIN,
+ "m": Mode.MIN,
+ "-": Mode.MIN,
+ "aug": Mode.AUG,
+ "+": Mode.AUG,
+ "sus2": Mode.SUS2,
+ "sus": Mode.SUS,
+ "5": Mode.FIVE,
+ "five": Mode.FIVE
+}
+
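+# Semitone shifts applied to the major-scale 3rd and 5th for each mode, e.g.
+# Mode.SUS raises the 3rd by one semitone onto the 4th, and Mode.FIVE raises
+# it by three onto the 5th (a power chord with no third).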
+MODE_TO_SHIFT = {
+ Mode.MAJ: {3:0, 5:0},
+ Mode.DOM: {3:0, 5:0},
+ Mode.DIM: {3:-1, 5:-1},
+ Mode.MIN: {3:-1, 5:0},
+ Mode.AUG: {3:0, 5:1},
+ Mode.SUS2: {3:-2, 5:0},
+ Mode.SUS: {3:1, 5:0},
+ Mode.FIVE: {3:3, 5:0},
+}
+
+NOTE_TO_PITCH = {
+ "a": 9,
+ "b": 11,
+ "c": 12,
+ "d": 14,
+ "e": 16,
+ "f": 17,
+ "g": 19
+}
+
+PITCH_TO_NOTE = {}
+
+for note, pitch in NOTE_TO_PITCH.items():
+ PITCH_TO_NOTE[pitch] = note
+
+RM_TO_PITCH = {
+ "vii": 11,
+ "iii": 4,
+ "vi": 9,
+ "iv": 5,
+ "ii": 2,
+ "i": 0,
+ "v": 7
+}
+
+ACC_TO_SHIFT = {
+ "b": -1,
+ "#": 1
+}
+
+SCALE_DEGREE_SHIFT = {
+ 1: 0,
+ 2: 2,
+ 3: 4,
+ 4: 5,
+ 5: 7,
+ 6: 9,
+ 7: 11
+}
+
+def getNumber(string):
+ numStr = ""
+
+ for char in string:
+ if char.isdigit():
+ numStr += char
+
+ if len(numStr) > 0:
+ return int(numStr)
+
+ return
+
+def textToPitch(text, key = "c", voice = True):
+    text = text.lower()
+    isLetter = text[0] in NOTE_TO_PITCH.keys()
+    pitch = 0  # fallback so an unparseable root degrades gracefully
+
+    if isLetter:
+        pitch = NOTE_TO_PITCH[text[0]]
+    else:
+        for rm in RM_TO_PITCH.keys():
+            if rm in text:
+                pitch = RM_TO_PITCH[rm] + textToPitch(key)
+                break
+
+    for i in range(1 if isLetter else 0, len(text)):
+        if text[i] in ACC_TO_SHIFT.keys():
+            pitch += ACC_TO_SHIFT[text[i]]
+
+    return pitch
+
+def pitchToText(pitch):
+ octave = math.floor(pitch / 12)
+ pitch = pitch % 12
+ pitch = pitch + (12 if pitch < 9 else 0)
+ accidental = ""
+
+ if not (pitch in PITCH_TO_NOTE.keys()):
+ pitch = (pitch + 1) % 12
+ pitch = pitch + (12 if pitch < 9 else 0)
+ accidental = "b"
+
+ return PITCH_TO_NOTE[pitch].upper() + accidental + str(octave)
+
+def degreeToShift(deg):
+ return SCALE_DEGREE_SHIFT[(deg - 1) % 7 + 1] + math.floor(deg / 8) * 12
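+# e.g. degreeToShift(9) == 14: one octave (12) plus a major second (2).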
+
+def voice(chords):
+ center = 0
+ voiced_chords = []
+ chord_ct = 0
+ pChord = None
+
+ for i, currChord in enumerate(chords):
+
+ if len(currChord) == 0:
+ voiced_chords.append( [] )
+ continue
+ else:
+ if chord_ct == 0:
+ voiced_chords.append( currChord )
+ chord_ct += 1
+ center = currChord[1] + 3
+ pChord = currChord
+ continue
+
+ prevChord = pChord
+ voiced_chord = []
+
+        for i_, currNote in enumerate(currChord):
+            # Bass note: shift by an octave only when it is far from the
+            # previous bass note; otherwise keep it as written.
+            if i_ == 0:
+                prevNote = prevChord[0]
+                bestVoicing = currNote
+                if abs(currNote - prevNote) > 7:
+                    if currNote < prevNote and abs(currNote + 12 - prevNote) < abs(currNote - prevNote):
+                        bestVoicing = currNote + 12
+                    elif currNote > prevNote and abs(currNote - 12 - prevNote) < abs(currNote - prevNote):
+                        bestVoicing = currNote - 12
+
+                voiced_chord.append(bestVoicing)
+                continue
+
+ bestNeighbor = None
+ allowance = -1
+
+            while bestNeighbor is None:
+ allowance += 1
+ for i__, prevNote in enumerate(prevChord):
+ if i__ == 0:
+ continue
+
+ if (
+ abs(currNote - prevNote) % 12 == allowance
+ or abs(currNote - prevNote) % 12 == 12 - allowance
+ ):
+ bestNeighbor = prevNote
+ break
+
+ if currNote <= bestNeighbor:
+ bestVoicing = currNote + math.floor((bestNeighbor - currNote + 6) / 12) * 12
+ else:
+ bestVoicing = currNote + math.ceil((bestNeighbor - currNote - 6) / 12) * 12
+
+ bestVoicing = bestVoicing if (abs(bestVoicing - center) <= 8 or allowance > 2) else currNote
+ voiced_chord.append(bestVoicing)
+
+
+ voiced_chord.sort()
+ voiced_chords.append(voiced_chord)
+ pChord = voiced_chord
+
+ return voiced_chords
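+# Usage sketch (illustrative): voice-lead a ii-V-I progression so adjacent
+# chords move by small intervals (Chord is defined below).
+#
+#   progression = [Chord(c).getMIDI() for c in ["Dm7", "G7", "Cmaj7"]]
+#   smooth = voice(progression)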
+
+class Chord:
+ def __init__(self, string):
+ self.string = string
+ self.degrees = {}
+
+ string += " "
+ self.split = []
+ sect = ""
+
+ notes = list(NOTE_TO_PITCH.keys())
+ rms = list(RM_TO_PITCH.keys())
+ accs = list(ACC_TO_SHIFT.keys())
+ modes = list(TEXT_TO_MODE.keys())
+
+ rootAdded = False
+ modeAdded = False
+
+ isRomanNumeral = False
+ isSlashChord = False
+ isMaj7 = False
+
+ for i in range(0, len(string) - 1):
+ sect += string[i]
+ currChar = string[i].lower()
+ nextChar = string[i+1].lower()
+
+            rootFound = not rootAdded and (currChar in notes + rms + accs and nextChar not in rms + accs)
+ modeFound = False
+ numFound = (currChar.isdigit() and not nextChar.isdigit())
+
+ if (
+ (i == len(string) - 2)
+ or rootFound
+ or numFound
+ or nextChar == "/"
+ or currChar == ")"
+ ):
+ if rootFound:
+ self.root = sect
+ rootAdded = True
+
+ isRomanNumeral = self.root in rms
+ elif sect[0] == "/":
+ # case for 6/9 chords
+ if sect[1] == "9":
+ self.degrees[9] = 0
+ else:
+ isSlashChord = True
+ self.bassnote = sect[1:len(sect)]
+ else:
+ if not modeAdded:
+ for mode in modes:
+ modeFound = mode in sect[0:len(mode)]
+ if modeFound:
+ self.mode = TEXT_TO_MODE[mode]
+ modeAdded = True
+ break
+
+ if not modeAdded:
+ if not isRomanNumeral and str(getNumber(sect)) == sect:
+ self.mode = Mode.DOM
+ modeFound = True
+ modeAdded = True
+
+ deg = getNumber(sect)
+                    if deg is not None:
+ shift = 0
+
+ for char in sect:
+ if char == "#":
+ shift += 1
+ elif char == "b":
+ shift -= 1
+
+ if (not modeFound) or deg % 2 == 0:
+ self.degrees[deg] = shift
+ elif deg >= 7:
+ for i in range(7, deg+1):
+ if i % 2 != 0:
+ self.degrees[i] = shift
+
+ self.split.append(sect)
+ sect = ""
+
+ if not modeAdded:
+ # Case for minor roman numeral chords
+ if self.root in rms and self.root == self.root.lower():
+ self.mode = Mode.MIN
+ else:
+ self.mode = Mode.DOM
+
+ if not isSlashChord:
+ self.bassnote = self.root
+
+ for sect in self.split:
+ isMaj7 = ("maj" in sect) or isMaj7
+
+ if (7 in self.degrees.keys()) and not isMaj7:
+ self.degrees[7] = -1
+
+ def getMIDI(self, key="c", octave=4):
+ notes = {}
+
+ notes[0] = textToPitch(self.bassnote, key) - 12
+
+ root = textToPitch(self.root, key)
+ notes[1] = root
+ notes[3] = root + degreeToShift(3) + MODE_TO_SHIFT[self.mode][3]
+ notes[5] = root + degreeToShift(5) + MODE_TO_SHIFT[self.mode][5]
+
+ for deg in self.degrees.keys():
+ notes[deg] = root + degreeToShift(deg) + self.degrees[deg]
+
+ for deg in notes.keys():
+ notes[deg] += 12 * octave
+
+ return list(notes.values())
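+
+# Usage sketch (illustrative): "Cmaj7" in the key of C, voiced around octave 4;
+# the bass is returned an octave below the chord tones.
+#
+#   Chord("Cmaj7").getMIDI("c", 4)   # -> [48, 60, 64, 67, 71] (C3, C4 E4 G4 B4)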
diff --git a/utilities/constants.py b/utilities/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c4a44d1f6aef4d591494a7bcbb014579f8014d7
--- /dev/null
+++ b/utilities/constants.py
@@ -0,0 +1,97 @@
+import torch
+from third_party.midi_processor.processor import RANGE_NOTE_ON, RANGE_NOTE_OFF, RANGE_VEL, RANGE_TIME_SHIFT
+
+#Proposed (AMT l0.4)
+# VERSION = "v27_video_rpr_nosep_l0.4"
+VERSION = "AMT"
+
+#Best Baseline (MT)
+# VERSION = "v27_novideo_rpr_nosep"
+
+IS_SEPERATED = False # True : separate chord-quality and root outputs
+RPR = True
+IS_VIDEO = True
+
+GEN_MODEL = "Video Music Transformer"
+# LSTM
+# Transformer
+# Music Transformer
+# Video Music Transformer
+
+LOSS_LAMBDA = 0.4 # lambda * chord + (1 - lambda) * emotion
+
+EMOTION_THRESHOLD = 0.80
+
+VIS_MODELS = "2d/clip_l14p"
+SPLIT_VER = "v1"
+
+MUSIC_TYPE = "lab_v2_norm"
+# - midi_prep
+# - lab
+# - lab_v2
+# - lab_v2_norm
+# ----------------------------------------- #
+
+VIS_ABBR_DIC = {
+ "2d/clip_l14p" : "clip_l14p", # NEW
+}
+
+vis_arr = VIS_MODELS.split(" ")
+vis_arr.sort()
+vis_abbr_path = ""
+for v in vis_arr:
+ vis_abbr_path = vis_abbr_path + "_" + VIS_ABBR_DIC[v]
+vis_abbr_path = vis_abbr_path[1:]
+
+VIS_MODELS_PATH = vis_abbr_path
+VIS_MODELS_SORTED = " ".join(vis_arr)
+
+# CHORD
+CHORD_END = 157
+CHORD_PAD = CHORD_END + 1
+CHORD_SIZE = CHORD_PAD + 1
+
+# CHORD_ROOT
+CHORD_ROOT_END = 13
+CHORD_ROOT_PAD = CHORD_ROOT_END + 1
+CHORD_ROOT_SIZE = CHORD_ROOT_PAD + 1
+
+# CHORD_ATTR
+CHORD_ATTR_END = 14
+CHORD_ATTR_PAD = CHORD_ATTR_END + 1
+CHORD_ATTR_SIZE = CHORD_ATTR_PAD + 1
+
+# SEMANTIC
+SEMANTIC_PAD = 0.0
+
+# SCENE_OFFSET
+SCENE_OFFSET_PAD = 0.0
+
+# MOTION
+MOTION_PAD = 0.0
+
+# EMOTION
+EMOTION_PAD = 0.0
+
+# NOTE_DENSITY
+NOTE_DENSITY_PAD = 0.0
+
+# LOUDNESS
+LOUDNESS_PAD = 0.0
+
+# OTHER
+SEPERATOR = "========================="
+ADAM_BETA_1 = 0.9
+ADAM_BETA_2 = 0.98
+ADAM_EPSILON = 10e-9
+LR_DEFAULT_START = 1.0
+SCHEDULER_WARMUP_STEPS = 4000
+TORCH_FLOAT = torch.float32
+TORCH_INT = torch.int32
+TORCH_LABEL_TYPE = torch.long
+PREPEND_ZEROS_WIDTH = 4
+
+# MIDI
+TOKEN_END = RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_VEL + RANGE_TIME_SHIFT
+TOKEN_PAD = TOKEN_END + 1
+VOCAB_SIZE = TOKEN_PAD + 1
diff --git a/utilities/device.py b/utilities/device.py
new file mode 100755
index 0000000000000000000000000000000000000000..61f0cf29ef9c1698842ef9ebcda48581fa165c34
--- /dev/null
+++ b/utilities/device.py
@@ -0,0 +1,67 @@
+# For all things related to devices
+#### ONLY USE PROVIDED FUNCTIONS, DO NOT USE GLOBAL CONSTANTS ####
+
+import torch
+
+TORCH_CPU_DEVICE = torch.device("cpu")
+
+if(torch.cuda.device_count() > 0):
+ TORCH_CUDA_DEVICE = torch.device("cuda:0")
+else:
+ print("----- WARNING: CUDA devices not detected. This will cause the model to run very slow! -----")
+ print("")
+ TORCH_CUDA_DEVICE = None
+
+USE_CUDA = False
+
+# use_cuda
+def use_cuda(cuda_bool):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Sets whether to use CUDA (if available), or use the CPU (not recommended)
+ ----------
+ """
+
+ global USE_CUDA
+ USE_CUDA = cuda_bool
+
+# get_device
+def get_device():
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Grabs the default device. Default device is CUDA if available and use_cuda is not False, CPU otherwise.
+ ----------
+ """
+
+ if((not USE_CUDA) or (TORCH_CUDA_DEVICE is None)):
+ return TORCH_CPU_DEVICE
+ else:
+ return TORCH_CUDA_DEVICE
+
+# cuda_device
+def cuda_device():
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Grabs the cuda device (may be None if CUDA is not available)
+ ----------
+ """
+
+ return TORCH_CUDA_DEVICE
+
+# cpu_device
+def cpu_device():
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Grabs the cpu device
+ ----------
+ """
+
+ return TORCH_CPU_DEVICE
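+
+# Usage sketch (illustrative): opt in to CUDA once at startup, then always
+# allocate through get_device() so CPU-only machines keep working.
+#
+#   use_cuda(True)
+#   x = torch.zeros(4, device=get_device())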
diff --git a/utilities/lr_scheduling.py b/utilities/lr_scheduling.py
new file mode 100644
index 0000000000000000000000000000000000000000..6620a03c8d06c7b4dd3b2467db5dfa2b1ac5b9a5
--- /dev/null
+++ b/utilities/lr_scheduling.py
@@ -0,0 +1,58 @@
+import math
+
+# LrStepTracker
+class LrStepTracker:
+ """
+ ----------
+ Author: Ryan Marshall
+ Modified: Damon Gwinn
+ ----------
+ Class for custom learn rate scheduler (to be used by torch.optim.lr_scheduler.LambdaLR).
+
+ Learn rate for each step (batch) given the warmup steps is:
+ lr = [ 1/sqrt(d_model) ] * min[ 1/sqrt(step) , step * (warmup_steps)^-1.5 ]
+
+ This is from Attention is All you Need (https://arxiv.org/abs/1706.03762)
+ ----------
+ """
+
+ def __init__(self, model_dim=512, warmup_steps=4000, init_steps=0):
+ # Store Values
+ self.warmup_steps = warmup_steps
+ self.model_dim = model_dim
+ self.init_steps = init_steps
+
+ # Begin Calculations
+ self.invsqrt_dim = (1 / math.sqrt(model_dim))
+ self.invsqrt_warmup = (1 / (warmup_steps * math.sqrt(warmup_steps)))
+
+ # step
+ def step(self, step):
+ """
+ ----------
+ Author: Ryan Marshall
+ Modified: Damon Gwinn
+ ----------
+ Method to pass to LambdaLR. Increments the step and computes the new learn rate.
+ ----------
+ """
+
+ step += self.init_steps
+ if(step <= self.warmup_steps):
+ return self.invsqrt_dim * self.invsqrt_warmup * step
+ else:
+ invsqrt_step = (1 / math.sqrt(step))
+ return self.invsqrt_dim * invsqrt_step
+
+# get_lr
+def get_lr(optimizer):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Hack to get the current learn rate of the model
+ ----------
+ """
+
+ for param_group in optimizer.param_groups:
+ return param_group['lr']
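+
+# Usage sketch (illustrative; constants from utilities/constants.py): wire the
+# tracker into LambdaLR so the base lr of LR_DEFAULT_START (1.0) is rescaled
+# every step.
+#
+#   opt = torch.optim.Adam(model.parameters(), lr=LR_DEFAULT_START,
+#                          betas=(ADAM_BETA_1, ADAM_BETA_2), eps=ADAM_EPSILON)
+#   tracker = LrStepTracker(model_dim=512, warmup_steps=SCHEDULER_WARMUP_STEPS)
+#   scheduler = torch.optim.lr_scheduler.LambdaLR(opt, tracker.step)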
diff --git a/utilities/preprocessing.py b/utilities/preprocessing.py
new file mode 100755
index 0000000000000000000000000000000000000000..e0c59e653f39aad11928d223e0087b4e33e78423
--- /dev/null
+++ b/utilities/preprocessing.py
@@ -0,0 +1,39 @@
+import torch as th
+
+class Normalize(object):
+
+ def __init__(self, mean, std):
+ self.mean = th.FloatTensor(mean).view(1, 3, 1, 1)
+ self.std = th.FloatTensor(std).view(1, 3, 1, 1)
+
+ def __call__(self, tensor):
+ tensor = (tensor - self.mean) / (self.std + 1e-8)
+ return tensor
+
+class Preprocessing(object):
+
+ def __init__(self, type):
+ self.type = type
+ if type == '2d':
+ self.norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ elif type == '3d':
+ self.norm = Normalize(mean=[110.6, 103.2, 96.3], std=[1.0, 1.0, 1.0])
+
+ def _zero_pad(self, tensor, size):
+ n = size - len(tensor) % size
+ if n == size:
+ return tensor
+ else:
+ z = th.zeros(n, tensor.shape[1], tensor.shape[2], tensor.shape[3])
+ return th.cat((tensor, z), 0)
+
+ def __call__(self, tensor):
+ if self.type == '2d':
+ tensor = tensor / 255.0
+ tensor = self.norm(tensor)
+ elif self.type == '3d':
+ tensor = self._zero_pad(tensor, 16)
+ tensor = self.norm(tensor)
+ tensor = tensor.view(-1, 16, 3, 112, 112)
+ tensor = tensor.transpose(1, 2)
+ return tensor
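+
+# Usage sketch (illustrative): normalize a stack of RGB frames for the '2d'
+# pipeline; input is (n_frames, 3, H, W) with values in [0, 255].
+#
+#   preprocess = Preprocessing('2d')
+#   frames = th.randint(0, 256, (16, 3, 224, 224)).float()
+#   frames = preprocess(frames)   # scaled to [0, 1], then mean/std normalized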
diff --git a/utilities/run_model_regression.py b/utilities/run_model_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6eb1deaf004390dc32a2d80167df032166a9f5d
--- /dev/null
+++ b/utilities/run_model_regression.py
@@ -0,0 +1,120 @@
+import torch
+import time
+
+from .constants import *
+from utilities.device import get_device
+from .lr_scheduling import get_lr
+import torch.nn.functional as F
+
+def train_epoch(cur_epoch, model, dataloader, loss, opt, lr_scheduler=None, print_modulus=1):
+ out = -1
+ model.train()
+ for batch_num, batch in enumerate(dataloader):
+ time_before = time.time()
+ opt.zero_grad()
+
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+
+ feature_note_density = batch["note_density"].to(get_device())
+ feature_loudness = batch["loudness"].to(get_device())
+
+ y = model(
+ feature_semantic_list,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ feature_loudness = feature_loudness.flatten().reshape(-1,1) # (300, 1)
+ feature_note_density = feature_note_density.flatten().reshape(-1,1) # (300, 1)
+ feature_combined = torch.cat((feature_note_density, feature_loudness), dim=1) # (300, 2)
+
+ out = loss.forward(y, feature_combined)
+ out.backward()
+ opt.step()
+
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+ time_after = time.time()
+ time_took = time_after - time_before
+
+ if((batch_num+1) % print_modulus == 0):
+ print(SEPERATOR)
+ print("Epoch", cur_epoch, " Batch", batch_num+1, "/", len(dataloader))
+ print("LR:", get_lr(opt))
+ print("Train loss:", float(out))
+ print("")
+ print("Time (s):", time_took)
+ print(SEPERATOR)
+ print("")
+ return
+
+def eval_model(model, dataloader, loss):
+ model.eval()
+
+ avg_rmse = -1
+ avg_loss = -1
+ avg_rmse_note_density = -1
+ avg_rmse_loudness = -1
+ with torch.set_grad_enabled(False):
+ n_test = len(dataloader)
+
+ sum_loss = 0.0
+
+ sum_rmse = 0.0
+ sum_rmse_note_density = 0.0
+ sum_rmse_loudness = 0.0
+
+ for batch in dataloader:
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+ feature_loudness = batch["loudness"].to(get_device())
+ feature_note_density = batch["note_density"].to(get_device())
+
+ y = model(
+ feature_semantic_list,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ feature_loudness = feature_loudness.flatten().reshape(-1,1) # (300, 1)
+ feature_note_density = feature_note_density.flatten().reshape(-1,1) # (300, 1)
+ feature_combined = torch.cat((feature_note_density, feature_loudness), dim=1) # (300, 2)
+
+ mse = F.mse_loss(y, feature_combined)
+ rmse = torch.sqrt(mse)
+ sum_rmse += float(rmse)
+
+ y_note_density, y_loudness = torch.split(y, split_size_or_sections=1, dim=1)
+
+ mse_note_density = F.mse_loss(y_note_density, feature_note_density)
+ rmse_note_density = torch.sqrt(mse_note_density)
+ sum_rmse_note_density += float(rmse_note_density)
+
+ mse_loudness = F.mse_loss(y_loudness, feature_loudness)
+ rmse_loudness = torch.sqrt(mse_loudness)
+ sum_rmse_loudness += float(rmse_loudness)
+
+ out = loss.forward(y, feature_combined)
+ sum_loss += float(out)
+
+ avg_loss = sum_loss / n_test
+ avg_rmse = sum_rmse / n_test
+ avg_rmse_note_density = sum_rmse_note_density / n_test
+ avg_rmse_loudness = sum_rmse_loudness / n_test
+
+ return avg_loss, avg_rmse, avg_rmse_note_density, avg_rmse_loudness
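+
+# Note: RMSE is averaged per batch rather than computed once over the whole
+# split; with equal batch sizes the two agree closely but not exactly.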
diff --git a/utilities/run_model_vevo.py b/utilities/run_model_vevo.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b8349d4e25355e75b8543699754fe86752f050f
--- /dev/null
+++ b/utilities/run_model_vevo.py
@@ -0,0 +1,525 @@
+import torch
+import time
+
+from .constants import *
+from utilities.device import get_device
+from .lr_scheduling import get_lr
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+import json
+
+from dataset.vevo_dataset import compute_vevo_accuracy, compute_vevo_correspondence, compute_hits_k, compute_hits_k_root_attr, compute_vevo_accuracy_root_attr, compute_vevo_correspondence_root_attr
+
+def train_epoch(cur_epoch, model, dataloader,
+ train_loss_func, train_loss_emotion_func,
+ opt, lr_scheduler=None, print_modulus=1, isVideo=True):
+
+ loss_chord = -1
+ loss_emotion = -1
+ model.train()
+ for batch_num, batch in enumerate(dataloader):
+ time_before = time.time()
+ opt.zero_grad()
+
+ x = batch["x"].to(get_device())
+ tgt = batch["tgt"].to(get_device())
+ x_root = batch["x_root"].to(get_device())
+ tgt_root = batch["tgt_root"].to(get_device())
+ x_attr = batch["x_attr"].to(get_device())
+ tgt_attr = batch["tgt_attr"].to(get_device())
+ tgt_emotion = batch["tgt_emotion"].to(get_device())
+ tgt_emotion_prob = batch["tgt_emotion_prob"].to(get_device())
+
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_key = batch["key"].to(get_device())
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+
+ if isVideo:
+ # use VideoMusicTransformer
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = train_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = train_loss_func.forward(y_attr, tgt_attr)
+ loss_chord = loss_chord_root + loss_chord_attr
+
+ first_14 = tgt_emotion[:, :14]
+ last_2 = tgt_emotion[:, -2:]
+ tgt_emotion_attr = torch.cat((first_14, last_2), dim=1)
+
+ loss_emotion = train_loss_emotion_func.forward(y_attr, tgt_emotion_attr)
+
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+ total_loss.backward()
+ opt.step()
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+
+ else:
+ #videomusic tran nosep
+ y = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+ tgt = tgt.flatten()
+ tgt_emotion = tgt_emotion.squeeze()
+ loss_chord = train_loss_func.forward(y, tgt)
+ loss_emotion = train_loss_emotion_func.forward(y, tgt_emotion)
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+ total_loss.backward()
+ opt.step()
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+
+ else:
+ # music transformer
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = train_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = train_loss_func.forward(y_attr, tgt_attr)
+
+ loss_chord = loss_chord_root + loss_chord_attr
+ loss_emotion = -1
+
+ total_loss = loss_chord
+ total_loss.backward()
+ opt.step()
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+ else:
+ # use MusicTransformer (no sep)
+ y = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+ tgt = tgt.flatten()
+
+ loss_chord = train_loss_func.forward(y, tgt)
+ loss_emotion = -1
+
+ total_loss = loss_chord
+ total_loss.backward()
+
+ opt.step()
+
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+
+ time_after = time.time()
+ time_took = time_after - time_before
+
+ if((batch_num+1) % print_modulus == 0):
+ print(SEPERATOR)
+ print("Epoch", cur_epoch, " Batch", batch_num+1, "/", len(dataloader))
+ print("LR:", get_lr(opt))
+ print("Train loss (total):", float(total_loss))
+ print("Train loss (chord):", float(loss_chord))
+ print("Train loss (emotion):", float(loss_emotion))
+ print("")
+ print("Time (s):", time_took)
+ print(SEPERATOR)
+ print("")
+ return
+
+def eval_model(model, dataloader,
+ eval_loss_func, eval_loss_emotion_func,
+ isVideo = True, isGenConfusionMatrix=False):
+ model.eval()
+ avg_acc = -1
+ avg_cor = -1
+ avg_acc_cor = -1
+
+ avg_h1 = -1
+ avg_h3 = -1
+ avg_h5 = -1
+
+ avg_loss_chord = -1
+ avg_loss_emotion = -1
+ avg_total_loss = -1
+
+ true_labels = []
+ true_root_labels = []
+ true_attr_labels = []
+
+ pred_labels = []
+ pred_root_labels = []
+ pred_attr_labels = []
+
+ with torch.set_grad_enabled(False):
+ n_test = len(dataloader)
+ n_test_cor = 0
+
+ sum_loss_chord = 0.0
+ sum_loss_emotion = 0.0
+ sum_total_loss = 0.0
+
+ sum_acc = 0.0
+ sum_cor = 0.0
+
+ sum_h1 = 0.0
+ sum_h3 = 0.0
+ sum_h5 = 0.0
+
+ for batch in dataloader:
+ x = batch["x"].to(get_device())
+ tgt = batch["tgt"].to(get_device())
+ x_root = batch["x_root"].to(get_device())
+ tgt_root = batch["tgt_root"].to(get_device())
+ x_attr = batch["x_attr"].to(get_device())
+ tgt_attr = batch["tgt_attr"].to(get_device())
+ tgt_emotion = batch["tgt_emotion"].to(get_device())
+ tgt_emotion_prob = batch["tgt_emotion_prob"].to(get_device())
+
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_key = batch["key"].to(get_device())
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+
+ if isVideo:
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ sum_acc += float(compute_vevo_accuracy_root_attr(y_root, y_attr, tgt))
+ cor = float(compute_vevo_correspondence_root_attr(y_root, y_attr, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,1))
+ sum_h3 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,3))
+ sum_h5 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,5))
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = eval_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = eval_loss_func.forward(y_attr, tgt_attr)
+ loss_chord = loss_chord_root + loss_chord_attr
+
+ first_14 = tgt_emotion[:, :14]
+ last_2 = tgt_emotion[:, -2:]
+ tgt_emotion_attr = torch.cat((first_14, last_2), dim=1)
+
+ loss_emotion = eval_loss_emotion_func.forward(y_attr, tgt_emotion_attr)
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+ else:
+                    y = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ sum_acc += float(compute_vevo_accuracy(y, tgt ))
+ cor = float(compute_vevo_correspondence(y, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k(y, tgt,1))
+ sum_h3 += float(compute_hits_k(y, tgt,3))
+ sum_h5 += float(compute_hits_k(y, tgt,5))
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ tgt = tgt.flatten()
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord = eval_loss_func.forward(y, tgt)
+ loss_emotion = eval_loss_emotion_func.forward(y, tgt_emotion)
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+
+ if isGenConfusionMatrix:
+ pred = y.argmax(dim=1).detach().cpu().numpy()
+ pred_root = []
+ pred_attr = []
+
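+                        # Chord IDs 1..156 appear to pack root and quality as
+                        # (root-1)*13 + attr, with 0 as the no-chord token and
+                        # 157/158 as END/PAD (see utilities/constants.py).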
+ for i in pred:
+ if i == 0:
+ pred_root.append(0)
+ pred_attr.append(0)
+ elif i == 157:
+ pred_root.append(CHORD_ROOT_END)
+ pred_attr.append(CHORD_ATTR_END)
+ elif i == 158:
+ pred_root.append(CHORD_ROOT_PAD)
+ pred_attr.append(CHORD_ATTR_PAD)
+ else:
+ rootindex = int( (i-1)/13 ) + 1
+ attrindex = (i-1)%13 + 1
+ pred_root.append(rootindex)
+ pred_attr.append(attrindex)
+
+ pred_root = np.array(pred_root)
+ pred_attr = np.array(pred_attr)
+
+ true = tgt.detach().cpu().numpy()
+ true_root = tgt_root.detach().cpu().numpy()
+ true_attr = tgt_attr.detach().cpu().numpy()
+
+ pred_labels.extend(pred)
+ pred_root_labels.extend(pred_root)
+ pred_attr_labels.extend(pred_attr)
+
+ true_labels.extend(true)
+ true_root_labels.extend(true_root)
+ true_attr_labels.extend(true_attr)
+ else:
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ sum_acc += float(compute_vevo_accuracy_root_attr(y_root, y_attr, tgt))
+ cor = float(compute_vevo_correspondence_root_attr(y_root, y_attr, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,1))
+ sum_h3 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,3))
+ sum_h5 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,5))
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = eval_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = eval_loss_func.forward(y_attr, tgt_attr)
+ loss_chord = loss_chord_root + loss_chord_attr
+
+ first_14 = tgt_emotion[:, :14]
+ last_2 = tgt_emotion[:, -2:]
+ tgt_emotion_attr = torch.cat((first_14, last_2), dim=1)
+ loss_emotion = eval_loss_emotion_func.forward(y_attr, tgt_emotion_attr)
+
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+ else:
+ # use MusicTransformer no sep
+ y = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ sum_acc += float(compute_vevo_accuracy(y, tgt ))
+ cor = float(compute_vevo_correspondence(y, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k(y, tgt,1))
+ sum_h3 += float(compute_hits_k(y, tgt,3))
+ sum_h5 += float(compute_hits_k(y, tgt,5))
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+ tgt = tgt.flatten()
+ loss_chord = eval_loss_func.forward(y, tgt)
+ loss_emotion = eval_loss_emotion_func.forward(y, tgt_emotion)
+ total_loss = loss_chord
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+
+ avg_loss_chord = sum_loss_chord / n_test
+ avg_loss_emotion = sum_loss_emotion / n_test
+ avg_total_loss = sum_total_loss / n_test
+
+ avg_acc = sum_acc / n_test
+        avg_cor = sum_cor / n_test_cor if n_test_cor > 0 else -1  # guard: no qualifying batches
+
+ avg_h1 = sum_h1 / n_test
+ avg_h3 = sum_h3 / n_test
+ avg_h5 = sum_h5 / n_test
+
+ avg_acc_cor = (avg_acc + avg_cor)/ 2.0
+
+ if isGenConfusionMatrix:
+ chordInvDicPath = "./dataset/vevo_meta/chord_inv.json"
+ chordRootInvDicPath = "./dataset/vevo_meta/chord_root_inv.json"
+ chordAttrInvDicPath = "./dataset/vevo_meta/chord_attr_inv.json"
+
+ with open(chordInvDicPath) as json_file:
+ chordInvDic = json.load(json_file)
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+
+ # Confusion matrix (CHORD)
+ topChordList = []
+ with open("./dataset/vevo_meta/top_chord.txt", encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if len(line_arr) == 3 :
+ chordID = line_arr[1]
+ topChordList.append( int(chordID) )
+ topChordList = np.array(topChordList)
+ topChordList = topChordList[:10]
+ mask = np.isin(true_labels, topChordList)
+ true_labels = np.array(true_labels)[mask]
+ pred_labels = np.array(pred_labels)[mask]
+
+ conf_matrix = confusion_matrix(true_labels, pred_labels, labels=topChordList)
+ label_names = [ chordInvDic[str(label_id)] for label_id in topChordList ]
+
+ plt.figure(figsize=(8, 6))
+ plt.imshow(conf_matrix, cmap=plt.cm.Blues)
+ plt.title("Confusion Matrix")
+ plt.colorbar()
+ tick_marks = np.arange(len(topChordList))
+ plt.xticks(tick_marks, label_names, rotation=45)
+ plt.yticks(tick_marks, label_names)
+ thresh = conf_matrix.max() / 2.0
+ for i in range(conf_matrix.shape[0]):
+ for j in range(conf_matrix.shape[1]):
+ plt.text(j, i, format(conf_matrix[i, j], 'd'),
+ ha="center", va="center",
+ color="white" if conf_matrix[i, j] > thresh else "black")
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.tight_layout()
+ plt.savefig("confusion_matrix.png")
+ plt.show()
+
+ # Confusion matrix (CHORD ROOT)
+ chordRootList = np.arange(1, 13)
+ conf_matrix = confusion_matrix(true_root_labels, pred_root_labels, labels= chordRootList )
+
+ label_names = [ chordRootInvDic[str(label_id)] for label_id in chordRootList ]
+
+ plt.figure(figsize=(8, 6))
+ plt.imshow(conf_matrix, cmap=plt.cm.Blues)
+ plt.title("Confusion Matrix (Chord root)")
+ plt.colorbar()
+ tick_marks = np.arange(len(chordRootList))
+ plt.xticks(tick_marks, label_names, rotation=45)
+ plt.yticks(tick_marks, label_names)
+ thresh = conf_matrix.max() / 2.0
+ for i in range(conf_matrix.shape[0]):
+ for j in range(conf_matrix.shape[1]):
+ plt.text(j, i, format(conf_matrix[i, j], 'd'),
+ ha="center", va="center",
+ color="white" if conf_matrix[i, j] > thresh else "black")
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.tight_layout()
+ plt.savefig("confusion_matrix_root.png")
+ plt.show()
+
+ # Confusion matrix (CHORD ATTR)
+ chordAttrList = np.arange(1, 14)
+ conf_matrix = confusion_matrix(true_attr_labels, pred_attr_labels, labels= chordAttrList )
+
+ label_names = [ chordAttrInvDic[str(label_id)] for label_id in chordAttrList ]
+
+ plt.figure(figsize=(8, 6))
+ plt.imshow(conf_matrix, cmap=plt.cm.Blues)
+ plt.title("Confusion Matrix (Chord quality)")
+ plt.colorbar()
+ tick_marks = np.arange(len(chordAttrList))
+ plt.xticks(tick_marks, label_names, rotation=45)
+ plt.yticks(tick_marks, label_names)
+ thresh = conf_matrix.max() / 2.0
+ for i in range(conf_matrix.shape[0]):
+ for j in range(conf_matrix.shape[1]):
+ plt.text(j, i, format(conf_matrix[i, j], 'd'),
+ ha="center", va="center",
+ color="white" if conf_matrix[i, j] > thresh else "black")
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.tight_layout()
+ plt.savefig("confusion_matrix_quality.png")
+ plt.show()
+
+ return { "avg_total_loss" : avg_total_loss,
+ "avg_loss_chord" : avg_loss_chord,
+ "avg_loss_emotion": avg_loss_emotion,
+ "avg_acc" : avg_acc,
+ "avg_cor" : avg_cor,
+ "avg_acc_cor" : avg_acc_cor,
+ "avg_h1" : avg_h1,
+ "avg_h3" : avg_h3,
+ "avg_h5" : avg_h5 }
+
diff --git a/utilities/video_loader.py b/utilities/video_loader.py
new file mode 100755
index 0000000000000000000000000000000000000000..9261585228a28c563ba974fa49ce02da2a548572
--- /dev/null
+++ b/utilities/video_loader.py
@@ -0,0 +1,83 @@
+import torch as th
+from torch.utils.data import Dataset
+import pandas as pd
+import os
+import numpy as np
+import ffmpeg
+
+class VideoLoader(Dataset):
+ def __init__(
+ self,
+ fileList = [],
+ framerate=1,
+ size=112,
+ centercrop=False,
+ ):
+ #self.csv = pd.read_csv(csv)
+ self.fileList = fileList
+
+ self.centercrop = centercrop
+ self.size = size
+ self.framerate = framerate
+
+ def __len__(self):
+ return len(self.fileList)
+
+ def _get_video_dim(self, video_path):
+ probe = ffmpeg.probe(video_path)
+ video_stream = next((stream for stream in probe['streams']
+ if stream['codec_type'] == 'video'), None)
+ width = int(video_stream['width'])
+ height = int(video_stream['height'])
+ return height, width
+
+ def _get_output_dim(self, h, w):
+ if isinstance(self.size, tuple) and len(self.size) == 2:
+ return self.size
+ elif h >= w:
+ return int(h * self.size / w), self.size
+ else:
+ return self.size, int(w * self.size / h)
+
+ def __getitem__(self, idx):
+
+ video_path = self.fileList[idx]
+ output_file = video_path[:video_path.rfind(".")] + ".npy"
+
+ #video_path = self.csv['video_path'].values[idx]
+ #output_file = self.csv['feature_path'].values[idx]
+
+ if not(os.path.isfile(output_file)) and os.path.isfile(video_path):
+ print('Decoding video: {}'.format(video_path))
+
+
+ try:
+ h, w = self._get_video_dim(video_path)
+            except Exception:
+ print('ffprobe failed at: {}'.format(video_path))
+ return {'video': th.zeros(1), 'input': video_path,
+ 'output': output_file}
+ height, width = self._get_output_dim(h, w)
+ cmd = (
+ ffmpeg
+ .input(video_path)
+ .filter('fps', fps=self.framerate)
+ .filter('scale', width, height)
+ )
+ if self.centercrop:
+ x = int((width - self.size) / 2.0)
+ y = int((height - self.size) / 2.0)
+ cmd = cmd.crop(x, y, self.size, self.size)
+ out, _ = (
+ cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24')
+ .run(capture_stdout=True, quiet=True)
+ )
+ if self.centercrop and isinstance(self.size, int):
+ height, width = self.size, self.size
+ video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+ video = th.from_numpy(video.astype('float32'))
+ video = video.permute(0, 3, 1, 2)
+ else:
+ video = th.zeros(1)
+
+ return {'video': video, 'input': video_path, 'output': output_file}
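+
+# Usage sketch (illustrative; "input.mp4" is a placeholder path): decode a
+# video at 1 fps into center-cropped frames ready for feature extraction.
+#
+#   loader = VideoLoader(fileList=["input.mp4"], framerate=1, size=224, centercrop=True)
+#   sample = loader[0]   # {'video': (n, 3, 224, 224) tensor, 'input': ..., 'output': ...}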