diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..144cb69b1c6da603b1fe06698fac665485699636 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.sf2 filter=lfs diff=lfs merge=lfs -text
diff --git a/app.py b/app.py
index a699bc5b3c2e987102ca93e0ee28d601e0a93d02..418f623e612ce5f8c695c82b7cfd4131318e34d2 100644
--- a/app.py
+++ b/app.py
@@ -1,7 +1,741 @@
import gradio as gr
+from pathlib import Path
-def greet(name):
- return "Hello " + name + "!!"
+import torch
+import shutil
+import os
+import subprocess
+import cv2
+import math
+import clip
+import numpy as np
+from PIL import Image
+from scenedetect import open_video, SceneManager
+from scenedetect.detectors import ContentDetector, AdaptiveDetector
+from scenedetect.video_splitter import split_video_ffmpeg
+from scenedetect.scene_manager import save_images
+from utilities.constants import *
+from utilities.chord_to_midi import *
+
+from model.video_music_transformer import VideoMusicTransformer
+from model.video_regression import VideoRegression
+
+import json
+from midi2audio import FluidSynth
+import moviepy.editor as mp
+from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
+import random
+from moviepy.editor import *
+import time
+
+from tqdm import tqdm
+from huggingface_hub import snapshot_download
+
+all_key_names = ['C major', 'G major', 'D major', 'A major',
+ 'E major', 'B major', 'F major', 'Bb major',
+ 'Eb major', 'Ab major', 'Db major', 'Gb major',
+ 'A minor', 'E minor', 'B minor', 'F# minor',
+ 'C# minor', 'G# minor', 'D minor', 'G minor',
+ 'C minor', 'F minor', 'Bb minor', 'Eb minor',
+ ]
+
+# Semitone shift from C major / A minor to each key, kept within -7..+4.
+transpose_key_dic = {
+ 'F major' : -7,
+ 'Gb major' : -6,
+ 'G major' : -5,
+ 'Ab major' : -4,
+ 'A major' : -3,
+ 'Bb major' : -2,
+ 'B major' : -1,
+ 'C major' : 0,
+ 'Db major' : 1,
+ 'D major' : 2,
+ 'Eb major' : 3,
+ 'E major' : 4,
+ 'D minor' : -7,
+ 'Eb minor' : -6,
+ 'E minor' : -5,
+ 'F minor' : -4,
+ 'F# minor' : -3,
+ 'G minor' : -2,
+ 'G# minor' : -1,
+ 'A minor' : 0,
+ 'Bb minor' : 1,
+ 'B minor' : 2,
+ 'C minor' : 3,
+ 'C# minor' : 4
+}
+
+flatsharpDic = {
+ 'Db':'C#',
+ 'Eb':'D#',
+ 'Gb':'F#',
+ 'Ab':'G#',
+ 'Bb':'A#'
+}
+
+max_conseq_N = 0
+max_conseq_chord = 2
+tempo = 120
+duration = 2
+
+min_loudness = 0 # Minimum loudness level in the input range
+max_loudness = 50 # Maximum loudness level in the input range
+min_velocity = 49 # Minimum velocity value in the output range
+max_velocity = 112 # Maximum velocity value in the output range
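+# Loudness (0-50) is mapped to velocity (49-112) with a concave curve
+# (exponent 0.3; see generate()): e.g. loudness 25 -> round((25/50)**0.3 * 63 + 49) = 100.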
+
+
+def split_video_into_frames(video, frame_dir):
+    output_path = os.path.join(frame_dir, "%03d.jpg")
+    # Keep roughly one frame per second; quote paths so ffmpeg handles spaces.
+    cmd = f"ffmpeg -i \"{video}\" -vf \"select=bitor(gte(t-prev_selected_t\,1)\,isnan(prev_selected_t))\" -vsync 0 -qmin 1 -q:v 1 \"{output_path}\""
+    subprocess.call(cmd, shell=True)
+
+def gen_semantic_feature(frame_dir, semantic_dir):
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model, preprocess = clip.load("ViT-L/14@336px", device=device)
+ file_names = os.listdir(frame_dir)
+ sorted_file_names = sorted(file_names)
+
+ output_path = semantic_dir / "semantic.npy"
+    # CLIP ViT-L/14 image embeddings are 768-dimensional; allocate on the
+    # active device so this also works without a GPU.
+    features = torch.zeros(len(sorted_file_names), 768, device=device)
+
+ for idx, file_name in enumerate(sorted_file_names):
+ fpath = frame_dir / file_name
+ image = preprocess(Image.open(fpath)).unsqueeze(0).to(device)
+ with torch.no_grad():
+ image_features = model.encode_image(image)
+ features[idx] = image_features[0]
+ features = features.cpu().numpy()
+ np.save(output_path, features)
+
+def gen_emotion_feature(frame_dir, emotion_dir):
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model, preprocess = clip.load("ViT-L/14@336px", device=device)
+ text = clip.tokenize(["exciting", "fearful", "tense", "sad", "relaxing", "neutral"]).to(device)
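+    # Zero-shot emotion recognition: each frame is scored against the six
+    # text prompts above and the softmax over the logits is written out.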
+
+ file_names = os.listdir(frame_dir)
+ sorted_file_names = sorted(file_names)
+ output_path = emotion_dir / "emotion.lab"
+
+ emolist = []
+ for file_name in sorted_file_names:
+ fpath = frame_dir / file_name
+ image = preprocess(Image.open(fpath)).unsqueeze(0).to(device)
+ with torch.no_grad():
+ logits_per_image, logits_per_text = model(image, text)
+ probs = logits_per_image.softmax(dim=-1).cpu().numpy()
+
+        emo_val = " ".join(format(p, ".4f") for p in probs[0])
+        emolist.append(emo_val)
+
+ with open(output_path ,'w' ,encoding = 'utf-8') as f:
+ f.write("time exciting_prob fearful_prob tense_prob sad_prob relaxing_prob neutral_prob\n")
+ for i in range(0, len(emolist) ):
+ f.write(str(i) + " "+emolist[i]+"\n")
+
+def gen_scene_feature(video, scene_dir):
+ video_stream = open_video(str(video))
+
+ scene_manager = SceneManager()
+ scene_manager.add_detector(AdaptiveDetector())
+ scene_manager.detect_scenes(video_stream, show_progress=False)
+ scene_list = scene_manager.get_scene_list()
+
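+    # Label each second of the video with the index of the scene that
+    # contains it.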
+ sec = 0
+ scenedict = {}
+ for idx, scene in enumerate(scene_list):
+ end_int = math.ceil(scene[1].get_seconds())
+ for s in range (sec, end_int):
+ scenedict[s] = str(idx)
+ sec += 1
+
+ fpathname = scene_dir / "scene.lab"
+ with open(fpathname,'w',encoding = 'utf-8') as f:
+ for i in range(0, len(scenedict)):
+ f.write(str(i) + " "+scenedict[i]+"\n")
+
+def gen_scene_offset_feature(scene_dir, scene_offset_dir):
+ src = scene_dir / "scene.lab"
+ tgt = scene_offset_dir / "scene_offset.lab"
+
+ id_list = []
+ with open(src, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+            if len(line_arr) == 2:
+ time = int(line_arr[0])
+ scene_id = int(line_arr[1])
+ id_list.append(scene_id)
+
+ offset_list = []
+ current_id = id_list[0]
+ offset = 0
+ for i in range(len(id_list)):
+ if id_list[i] != current_id:
+ current_id = id_list[i]
+ offset = 0
+ offset_list.append(offset)
+ offset += 1
+
+ with open(tgt,'w',encoding = 'utf-8') as f:
+ for i in range(0, len(offset_list)):
+ f.write(str(i) + " " + str(offset_list[i]) + "\n")
+
+def gen_motion_feature(video, motion_dir):
+ cap = cv2.VideoCapture(str(video))
+ prev_frame = None
+ prev_time = 0
+ motion_value = 0
+ motiondict = {}
+
+    # One motion sample per second: mean absolute difference between frames.
+    motiondict[0] = "0.0000"
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        curr_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
+        if prev_frame is not None and curr_time - prev_time >= 1:
+            diff = cv2.absdiff(frame, prev_frame)
+            diff_rgb = cv2.cvtColor(diff, cv2.COLOR_BGR2RGB)
+            motion_value = format(diff_rgb.mean(), ".4f")
+            motiondict[int(curr_time)] = str(motion_value)
+            prev_time = int(curr_time)
+        prev_frame = frame.copy()
+ cap.release()
+ cv2.destroyAllWindows()
+ fpathname = motion_dir / "motion.lab"
+
+ with open(fpathname,'w',encoding = 'utf-8') as f:
+ for i in range(0, len(motiondict)):
+ f.write(str(i) + " "+motiondict[i]+"\n")
+
+
+def get_scene_offset_feature(scene_offset_dir, max_seq_chord=300, max_seq_video=300):
+ feature_scene_offset = np.empty(max_seq_video)
+ feature_scene_offset.fill(SCENE_OFFSET_PAD)
+ fpath_scene_offset = scene_offset_dir / "scene_offset.lab"
+
+ with open(fpath_scene_offset, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= max_seq_chord:
+ break
+ sceneID = line_arr[1]
+ feature_scene_offset[time] = int(sceneID)+1
+
+ feature_scene_offset = torch.from_numpy(feature_scene_offset)
+ feature_scene_offset = feature_scene_offset.to(torch.float32)
+
+ return feature_scene_offset
+
+def get_motion_feature(motion_dir, max_seq_chord=300, max_seq_video=300):
+ fpath_motion = motion_dir / "motion.lab"
+ feature_motion = np.empty(max_seq_video)
+ feature_motion.fill(MOTION_PAD)
+ with open(fpath_motion, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= max_seq_chord:
+ break
+ motion = line_arr[1]
+ feature_motion[time] = float(motion)
+
+ feature_motion = torch.from_numpy(feature_motion)
+ feature_motion = feature_motion.to(torch.float32)
+ return feature_motion
+
+def get_emotion_feature(emotion_dir, max_seq_chord=300, max_seq_video=300):
+ fpath_emotion = emotion_dir / "emotion.lab"
+ feature_emotion = np.empty((max_seq_video, 6))
+ feature_emotion.fill(EMOTION_PAD)
+
+ with open(fpath_emotion, encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if line_arr[0] == "time":
+ continue
+ time = line_arr[0]
+ time = int(time)
+ if time >= max_seq_chord:
+ break
+ emo1, emo2, emo3, emo4, emo5, emo6 = \
+ line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5],line_arr[6]
+ emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5), float(emo6) ]
+ emoList = np.array(emoList)
+ feature_emotion[time] = emoList
+
+ feature_emotion = torch.from_numpy(feature_emotion)
+ feature_emotion = feature_emotion.to(torch.float32)
+ return feature_emotion
+
+def get_semantic_feature(semantic_dir, max_seq_chord=300, max_seq_video=300):
+ fpath_semantic = semantic_dir / "semantic.npy"
+
+ video_feature = np.load(fpath_semantic)
+ dim_vf = video_feature.shape[1]
+
+ video_feature_tensor = torch.from_numpy( video_feature )
+ feature_semantic = torch.full((max_seq_video, dim_vf,), SEMANTIC_PAD , dtype=torch.float32, device=torch.device("cpu"))
+
+ if(video_feature_tensor.shape[0] < max_seq_video):
+ feature_semantic[:video_feature_tensor.shape[0]] = video_feature_tensor
+ else:
+ feature_semantic = video_feature_tensor[:max_seq_video]
+
+ return feature_semantic
+
+
+def text_clip(text: str, duration: int, start_time: int = 0):
+ t = TextClip(text, font='Georgia-Regular', fontsize=24, color='white')
+ t = t.set_position(("center", 20)).set_duration(duration)
+ t = t.set_start(start_time)
+ return t
+
+def convert_format_id_to_offset(id_list):
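+    # Turn per-step IDs into offsets within each run of equal IDs,
+    # e.g. [0, 0, 1, 1, 1] -> [0, 1, 0, 1, 2].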
+ offset_list = []
+ current_id = id_list[0]
+ offset = 0
+ for i in range(len(id_list)):
+ if id_list[i] != current_id:
+ current_id = id_list[i]
+ offset = 0
+ offset_list.append(offset)
+ offset += 1
+ return offset_list
+
+
+class Video2music:
+ def __init__(
+ self,
+ name="amaai-lab/video2music",
+ device="cuda:0",
+ cache_dir=None,
+ local_files_only=False,
+ ):
+ # path = snapshot_download(repo_id=name, cache_dir=cache_dir)
+
+ self.device = device
+
+ # self.model.device = device
+ # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ # f"{path}/beats/microsoft-deberta-v3-large.pt"
+
+ # self.model_weights = f"{path}/saved_models/AMT/best_loss_weights.pickle"
+ # self.modelReg_weights = f"{path}/saved_models/AMT/best_rmse_weights.pickle"
+
+ self.model_weights = "saved_models/AMT/best_loss_weights.pickle"
+ self.modelReg_weights = "saved_models/AMT/best_rmse_weights.pickle"
+
+ self.total_vf_dim = 776
+ # 768 (sem) + 1 (mo) + 1 (scene) + 6 (emo)
+ self.max_seq_video = 300
+ self.max_seq_chord = 300
+
+ self.model = VideoMusicTransformer(n_layers=6, num_heads=8,
+ d_model=512, dim_feedforward=1024,
+ max_sequence_midi=2048, max_sequence_video=300,
+ max_sequence_chord=300, total_vf_dim=self.total_vf_dim, rpr=RPR).to(device)
+
+ self.model.load_state_dict(torch.load(self.model_weights, map_location=device))
+ self.modelReg = VideoRegression(max_sequence_video=300, total_vf_dim=self.total_vf_dim, regModel= "bigru").to(device)
+ self.modelReg.load_state_dict(torch.load(self.modelReg_weights, map_location=device))
+
+ self.model.eval()
+ self.modelReg.eval()
+
+ self.SF2_FILE = "default_sound_font.sf2"
+
+ def generate(self, video, primer, key):
+
+ feature_dir = Path("./feature")
+ output_dir = Path("./output")
+ if feature_dir.exists():
+ shutil.rmtree(str(feature_dir))
+ if output_dir.exists():
+ shutil.rmtree(str(output_dir))
+
+ feature_dir.mkdir(parents=True)
+ output_dir.mkdir(parents=True)
+
+ frame_dir = feature_dir / "vevo_frame"
+
+ #video features
+ semantic_dir = feature_dir / "vevo_semantic"
+ emotion_dir = feature_dir / "vevo_emotion"
+ scene_dir = feature_dir / "vevo_scene"
+ scene_offset_dir = feature_dir / "vevo_scene_offset"
+ motion_dir = feature_dir / "vevo_motion"
+
+ frame_dir.mkdir(parents=True)
+ semantic_dir.mkdir(parents=True)
+ emotion_dir.mkdir(parents=True)
+ scene_dir.mkdir(parents=True)
+ scene_offset_dir.mkdir(parents=True)
+ motion_dir.mkdir(parents=True)
+
+ #music features
+ chord_dir = feature_dir / "vevo_chord"
+ loudness_dir = feature_dir / "vevo_loudness"
+ note_density_dir = feature_dir / "vevo_note_density"
+
+ chord_dir.mkdir(parents=True)
+ loudness_dir.mkdir(parents=True)
+ note_density_dir.mkdir(parents=True)
+
+ split_video_into_frames(video, frame_dir)
+ gen_semantic_feature(frame_dir, semantic_dir)
+ gen_emotion_feature(frame_dir, emotion_dir)
+ gen_scene_feature(video, scene_dir)
+ gen_scene_offset_feature(scene_dir, scene_offset_dir)
+ gen_motion_feature(video, motion_dir)
+
+ feature_scene_offset = get_scene_offset_feature(scene_offset_dir)
+ feature_motion = get_motion_feature(motion_dir)
+ feature_emotion = get_emotion_feature(emotion_dir)
+ feature_semantic = get_semantic_feature(semantic_dir)
+
+ # cuda
+ feature_scene_offset = feature_scene_offset.to(self.device)
+ feature_motion = feature_motion.to(self.device)
+ feature_emotion = feature_emotion.to(self.device)
+
+ feature_scene_offset = feature_scene_offset.unsqueeze(0)
+ feature_motion = feature_motion.unsqueeze(0)
+ feature_emotion = feature_emotion.unsqueeze(0)
+
+ feature_semantic = feature_semantic.to(self.device)
+ feature_semantic_list = []
+ feature_semantic = torch.unsqueeze(feature_semantic, 0)
+ feature_semantic_list.append( feature_semantic.to(self.device) )
+ #feature_semantic_list.append( feature_semantic )
+
+ if "major" in key:
+ feature_key = torch.tensor([0])
+ feature_key = feature_key.float()
+ elif "minor" in key:
+ feature_key = torch.tensor([1])
+ feature_key = feature_key.float()
+
+ feature_key = feature_key.to(self.device)
+
+ with open('dataset/vevo_meta/chord.json') as json_file:
+ chordDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_inv.json') as json_file:
+ chordInvDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_root.json') as json_file:
+ chordRootDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_attr.json') as json_file:
+ chordAttrDic = json.load(json_file)
+
+ if primer.strip() == "":
+ if "major" in key:
+ primer = "C"
+ else:
+ primer = "Am"
+
+ pChordList = primer.split(" ")
+
+ primerCID = []
+ primerCID_root = []
+ primerCID_attr = []
+
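+        # Normalize primer chords such as "Bb", "Am", or "CM7" into the
+        # "root:quality" labels of dataset/vevo_meta/chord.json
+        # (e.g. "Bb" -> "A#", "Am" -> "A:min", "CM7" -> "C:maj7").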
+ for pChord in pChordList:
+ if len(pChord) > 1:
+ if pChord[1] == "b":
+ pChord = flatsharpDic [ pChord[0:2] ] + pChord[2:]
+ type_idx = 0
+ if pChord[1] == "#":
+ pChord = pChord[0:2] + ":" + pChord[2:]
+ type_idx = 2
+ else:
+ pChord = pChord[0:1] + ":" + pChord[1:]
+ type_idx = 1
+ if pChord[type_idx+1:] == "m":
+ pChord = pChord[0:type_idx] + ":min"
+ if pChord[type_idx+1:] == "m6":
+ pChord = pChord[0:type_idx] + ":min6"
+ if pChord[type_idx+1:] == "m7":
+ pChord = pChord[0:type_idx] + ":min7"
+ if pChord[type_idx+1:] == "M6":
+ pChord = pChord[0:type_idx] + ":maj6"
+ if pChord[type_idx+1:] == "M7":
+ pChord = pChord[0:type_idx] + ":maj7"
+ if pChord[type_idx+1:] == "":
+ pChord = pChord[0:type_idx]
+
+ chordID = chordDic[pChord]
+ primerCID.append(chordID)
+
+ chord_arr = pChord.split(":")
+ if len(chord_arr) == 1:
+ chordRootID = chordRootDic[chord_arr[0]]
+ primerCID_root.append(chordRootID)
+ primerCID_attr.append(0)
+ elif len(chord_arr) == 2:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ primerCID_root.append(chordRootID)
+ primerCID_attr.append(chordAttrID)
+
+ primerCID = np.array(primerCID)
+ primerCID = torch.from_numpy(primerCID)
+ primerCID = primerCID.to(torch.long)
+ primerCID = primerCID.to(self.device)
+
+ primerCID_root = np.array(primerCID_root)
+ primerCID_root = torch.from_numpy(primerCID_root)
+ primerCID_root = primerCID_root.to(torch.long)
+ primerCID_root = primerCID_root.to(self.device)
+
+ primerCID_attr = np.array(primerCID_attr)
+ primerCID_attr = torch.from_numpy(primerCID_attr)
+ primerCID_attr = primerCID_attr.to(torch.long)
+ primerCID_attr = primerCID_attr.to(self.device)
+
+ # self.model.eval()
+ # self.modelReg.eval()
+
+ with torch.set_grad_enabled(False):
+ rand_seq = self.model.generate(feature_semantic_list=feature_semantic_list,
+ feature_key=feature_key,
+ feature_scene_offset=feature_scene_offset,
+ feature_motion=feature_motion,
+ feature_emotion=feature_emotion,
+ primer = primerCID,
+ primer_root = primerCID_root,
+ primer_attr = primerCID_attr,
+ target_seq_length = 300,
+ beam=0,
+ max_conseq_N= max_conseq_N,
+ max_conseq_chord = max_conseq_chord)
+
+ y = self.modelReg(
+ feature_semantic_list,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ y_note_density, y_loudness = torch.split(y, split_size_or_sections=1, dim=1)
+ y_note_density_np = y_note_density.cpu().numpy()
+ y_note_density_np = np.round(y_note_density_np).astype(int)
+ y_note_density_np = np.clip(y_note_density_np, 0, 40)
+
+ y_loudness_np = y_loudness.cpu().numpy()
+ y_loudness_np_lv = (y_loudness_np * 100).astype(int)
+ y_loudness_np_lv = np.clip(y_loudness_np_lv, 0, 50)
+ velolistExp = []
+ exponent = 0.3
+ for item in y_loudness_np_lv:
+ loudness = item[0]
+ velocity_exp = np.round(((loudness - min_loudness) / (max_loudness - min_loudness)) ** exponent * (max_velocity - min_velocity) + min_velocity)
+ velocity_exp = int(velocity_exp)
+ velolistExp.append(velocity_exp)
+
+                # Map predicted note density onto one of five pattern levels.
+                densitylist = []
+                for item in y_note_density_np:
+                    density = item[0]
+ if density <= 6:
+ densitylist.append(0)
+ elif density <= 12:
+ densitylist.append(1)
+ elif density <= 18:
+ densitylist.append(2)
+ elif density <= 24:
+ densitylist.append(3)
+ else:
+ densitylist.append(4)
+
+ # generated ChordID to ChordSymbol
+ chord_genlist = []
+ chordID_genlist= rand_seq[0].cpu().numpy()
+ for i in chordID_genlist:
+ chord_genlist.append(chordInvDic[str(i)])
+
+ chord_offsetlist = convert_format_id_to_offset(chord_genlist)
+ f_path_midi = output_dir / "output.mid"
+ f_path_flac = output_dir / "output.flac"
+ f_path_video_out = output_dir / "output.mp4"
+
+ # ChordSymbol to MIDI file with voicing
+ MIDI = MIDIFile(1)
+ MIDI.addTempo(0, 0, tempo)
+            midi_chords_original = []
+            for i, k in enumerate(chord_genlist):
+                k = k.replace(":", "")
+                if k == "N":
+                    midi_chords_original.append([])
+                else:
+                    midi_chords_original.append(Chord(k).getMIDI("c", 4))
+            midi_chords = voice(midi_chords_original)
+            trans = transpose_key_dic[key]
+
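+            # Five accompaniment patterns of increasing rhythmic density:
+            # level 0 plays two chord tones per chord, level 4 plays eight,
+            # alternating voicings on even/odd offsets within a chord run.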
+ for i, chord in enumerate(midi_chords):
+ if densitylist[i] == 0:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 1 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ elif densitylist[i] == 1:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ elif densitylist[i] == 2:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ elif densitylist[i] == 3:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ elif densitylist[i] == 4:
+ if len(chord) >= 4:
+ if chord_offsetlist[i] % 2 == 0:
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.75 , duration, velolistExp[i])
+ else:
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[0]+trans, i * duration + 0.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 0.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 0.75 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[3]+trans, i * duration + 1 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.25 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[1]+trans, i * duration + 1.5 , duration, velolistExp[i])
+ MIDI.addNote(0, 0, chord[2]+trans, i * duration + 1.75 , duration, velolistExp[i])
+
+ with open(f_path_midi, "wb") as outputFile:
+ MIDI.writeFile(outputFile)
+
+ # Convert midi to audio (e.g., flac)
+ fs = FluidSynth(sound_font=self.SF2_FILE)
+ fs.midi_to_audio(str(f_path_midi), str(f_path_flac))
+
+ # Render generated music into input video
+ audio_mp = mp.AudioFileClip(str(f_path_flac))
+ video_mp = mp.VideoFileClip(str(video))
+
+ audio_mp = audio_mp.subclip(0, video_mp.duration )
+ final = video_mp.set_audio(audio_mp)
+
+ final.write_videofile(str(f_path_video_out),
+ codec='libx264',
+ audio_codec='aac',
+ temp_audiofile='temp-audio.m4a',
+ remove_temp=True
+ )
+ return Path(str(f_path_video_out))
+
+
+# Initialize Video2Music
+if torch.cuda.is_available():
+ video2music = Video2music()
+else:
+ video2music = Video2music(device="cpu")
+
+
+def gradio_generate(input_video, input_primer, input_key):
+ output_filename = video2music.generate(input_video, input_primer, input_key)
+ return str(output_filename)
+
+
+title="Video2Music: Suitable Music Generation from Videos using an Affective Multimodal Transformer model"
+description_text = """
+
+Generate background music using Video2Music by providing an input video.
+
+This is the demo for Video2Music: Suitable Music Generation from Videos using an Affective Multimodal Transformer model.
+Read our paper.
+
+"""
+input_video = gr.Video(label="Input Video")
+input_primer = gr.Textbox(label="Input Primer", value="C Am F G")
+input_key = gr.Dropdown(choices=["C major", "A minor"], value="C major", label="Input Key")
+output_video = gr.Video(label="Output Video")
+
+css = '''
+#duplicate-button {
+margin: auto;
+color: white;
+background: #1565c0;
+border-radius: 100vh;
+}
+'''
+
+# Gradio interface
+gr_interface = gr.Interface(
+ fn=gradio_generate,
+ inputs=[input_video, input_primer, input_key ],
+ outputs=[output_video],
+ description=description_text,
+ allow_flagging='never',
+ cache_examples=True,
+)
+
+
+# with gr.Blocks() as demo:
+with gr.Blocks(css=css) as demo:
+ title=gr.HTML(f"{title}
")
+ gr_interface.render()
+
+#demo.queue()
+# demo.launch(debug=True)
+
+demo.queue().launch()
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
\ No newline at end of file
diff --git a/dataset/README.md b/dataset/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8665e83e25d91c7bce9b6351679a6866b152df11
--- /dev/null
+++ b/dataset/README.md
@@ -0,0 +1,31 @@
+# MuVi-Sync Dataset
+
+- Dataset (MuVi-Sync)
+ * MuVi-Sync (features) [(Link)](https://zenodo.org/records/10057093)
+ * MuVi-Sync (original video) [(Link)](https://zenodo.org/records/10050294)
+
+## Overview
+Welcome to the MuVi-Sync dataset! This collection provides a rich array of features for both music and video elements. Here's a breakdown of the directory structure:
+
+### Music Features
+- **vevo_chord:** Chord feature data
+- **vevo_note_density:** Note density feature data
+- **vevo_loudness:** Loudness feature data
+
+### Video Features
+- **vevo_scene_offset:** Scene offset feature data
+- **vevo_emotion:** Emotion feature data
+ - *5c_l14p:* 5 emotion categories (exciting, fearful, tense, sad, relaxing)
+ - *6c_l14p:* 6 emotion categories (exciting, fearful, tense, sad, relaxing, neutral)
+- **vevo_semantic:** Semantic feature
+- **vevo_motion:** Motion feature
+
+### Others
+- **vevo_meta:**
+  - *idlist.txt:* List of file IDs, video titles, and YouTube IDs
+- **vevo:** Original video files (.mp4)
+
+Explore and utilize this dataset for innovative research and applications.
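+
+A minimal sketch of how these files might be read (the `001` file ID and paths are illustrative; `.lab` files are whitespace-separated text and semantic features are NumPy arrays, matching the loader in `dataset/vevo_dataset.py`):
+
+```python
+import numpy as np
+
+# Chord labels: "<time> <chord>" lines, with an optional leading
+# "key <tonic> <mode>" header line.
+chords = {}
+with open("vevo_chord/lab_v2_norm/all/001.lab", encoding="utf-8") as f:
+    for line in f:
+        parts = line.split()
+        if parts and parts[0] != "key":
+            chords[int(parts[0])] = parts[1]
+
+# Semantic features: one (num_seconds, 768) float array per video.
+semantic = np.load("vevo_semantic/all/2d/clip_l14p/001.npy")
+print(len(chords), semantic.shape)
+```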
+
+For more details, refer to our [GitHub repository](https://github.com/AMAAI-Lab/Video2Music).
+
diff --git a/dataset/__pycache__/vevo_dataset.cpython-37.pyc b/dataset/__pycache__/vevo_dataset.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba60ba07fae3faba4dfc512743065182986e39f0
Binary files /dev/null and b/dataset/__pycache__/vevo_dataset.cpython-37.pyc differ
diff --git a/dataset/vevo_dataset.py b/dataset/vevo_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..0373f5e15120481da4ea524d254ba121000e762a
--- /dev/null
+++ b/dataset/vevo_dataset.py
@@ -0,0 +1,720 @@
+import os
+import pickle
+import random
+import torch
+import torch.nn as nn
+import numpy as np
+
+from torch.utils.data import Dataset
+from utilities.constants import *
+from utilities.device import cpu_device
+from utilities.device import get_device
+
+import json
+
+SEQUENCE_START = 0
+
+class VevoDataset(Dataset):
+ def __init__(self, dataset_root = "./dataset/", split="train", split_ver="v1", vis_models="2d/clip_l14p", emo_model="6c_l14p", max_seq_chord=300, max_seq_video=300, random_seq=True, is_video = True):
+
+ self.dataset_root = dataset_root
+
+ self.vevo_chord_root = os.path.join( dataset_root, "vevo_chord", "lab_v2_norm", "all")
+ self.vevo_emotion_root = os.path.join( dataset_root, "vevo_emotion", emo_model, "all")
+ self.vevo_motion_root = os.path.join( dataset_root, "vevo_motion", "all")
+ self.vevo_scene_offset_root = os.path.join( dataset_root, "vevo_scene_offset", "all")
+ self.vevo_meta_split_path = os.path.join( dataset_root, "vevo_meta", "split", split_ver, split + ".txt")
+
+ self.vevo_loudness_root = os.path.join( dataset_root, "vevo_loudness", "all")
+ self.vevo_note_density_root = os.path.join( dataset_root, "vevo_note_density", "all")
+
+ self.max_seq_video = max_seq_video
+ self.max_seq_chord = max_seq_chord
+ self.random_seq = random_seq
+ self.is_video = is_video
+
+ self.vis_models_arr = vis_models.split(" ")
+ self.vevo_semantic_root_list = []
+ self.id_list = []
+
+ self.emo_model = emo_model
+
+        if self.is_video:
+ for i in range( len(self.vis_models_arr) ):
+ p1 = self.vis_models_arr[i].split("/")[0]
+ p2 = self.vis_models_arr[i].split("/")[1]
+ vevo_semantic_root = os.path.join(dataset_root, "vevo_semantic" , "all" , p1, p2)
+ self.vevo_semantic_root_list.append( vevo_semantic_root )
+
+ with open( self.vevo_meta_split_path ) as f:
+ for line in f:
+ self.id_list.append(line.strip())
+
+ self.data_files_chord = []
+ self.data_files_emotion = []
+ self.data_files_motion = []
+ self.data_files_scene_offset = []
+ self.data_files_semantic_list = []
+
+ self.data_files_loudness = []
+ self.data_files_note_density = []
+
+ for i in range(len(self.vis_models_arr)):
+ self.data_files_semantic_list.append([])
+
+ for fid in self.id_list:
+ fpath_chord = os.path.join( self.vevo_chord_root, fid + ".lab" )
+ fpath_emotion = os.path.join( self.vevo_emotion_root, fid + ".lab" )
+ fpath_motion = os.path.join( self.vevo_motion_root, fid + ".lab" )
+ fpath_scene_offset = os.path.join( self.vevo_scene_offset_root, fid + ".lab" )
+
+ fpath_loudness = os.path.join( self.vevo_loudness_root, fid + ".lab" )
+ fpath_note_density = os.path.join( self.vevo_note_density_root, fid + ".lab" )
+
+ fpath_semantic_list = []
+ for vevo_semantic_root in self.vevo_semantic_root_list:
+ fpath_semantic = os.path.join( vevo_semantic_root, fid + ".npy" )
+ fpath_semantic_list.append(fpath_semantic)
+
+ checkFile_semantic = True
+ for fpath_semantic in fpath_semantic_list:
+ if not os.path.exists(fpath_semantic):
+ checkFile_semantic = False
+
+ checkFile_chord = os.path.exists(fpath_chord)
+ checkFile_emotion = os.path.exists(fpath_emotion)
+ checkFile_motion = os.path.exists(fpath_motion)
+ checkFile_scene_offset = os.path.exists(fpath_scene_offset)
+
+ checkFile_loudness = os.path.exists(fpath_loudness)
+ checkFile_note_density = os.path.exists(fpath_note_density)
+
+ if checkFile_chord and checkFile_emotion and checkFile_motion \
+ and checkFile_scene_offset and checkFile_semantic and checkFile_loudness and checkFile_note_density :
+
+ self.data_files_chord.append(fpath_chord)
+ self.data_files_emotion.append(fpath_emotion)
+ self.data_files_motion.append(fpath_motion)
+ self.data_files_scene_offset.append(fpath_scene_offset)
+
+ self.data_files_loudness.append(fpath_loudness)
+ self.data_files_note_density.append(fpath_note_density)
+
+                if self.is_video:
+ for i in range(len(self.vis_models_arr)):
+ self.data_files_semantic_list[i].append( fpath_semantic_list[i] )
+
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+
+ chordRootDicPath = os.path.join( dataset_root, "vevo_meta/chord_root.json")
+ chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+ with open(chordDicPath) as json_file:
+ self.chordDic = json.load(json_file)
+
+ with open(chordRootDicPath) as json_file:
+ self.chordRootDic = json.load(json_file)
+
+ with open(chordAttrDicPath) as json_file:
+ self.chordAttrDic = json.load(json_file)
+
+ def __len__(self):
+ return len(self.data_files_chord)
+
+ def __getitem__(self, idx):
+ #### ---- CHORD ----- ####
+ feature_chord = np.empty(self.max_seq_chord)
+ feature_chord.fill(CHORD_PAD)
+
+ feature_chordRoot = np.empty(self.max_seq_chord)
+ feature_chordRoot.fill(CHORD_ROOT_PAD)
+ feature_chordAttr = np.empty(self.max_seq_chord)
+ feature_chordAttr.fill(CHORD_ATTR_PAD)
+
+ key = ""
+ with open(self.data_files_chord[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if line_arr[0] == "key":
+ key = line_arr[1] + " "+ line_arr[2]
+ continue
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ chord = line_arr[1]
+ chordID = self.chordDic[chord]
+ feature_chord[time] = chordID
+ chord_arr = chord.split(":")
+
+ if len(chord_arr) == 1:
+ if chord_arr[0] == "N":
+ chordRootID = self.chordRootDic["N"]
+ chordAttrID = self.chordAttrDic["N"]
+ feature_chordRoot[time] = chordRootID
+ feature_chordAttr[time] = chordAttrID
+ else:
+ chordRootID = self.chordRootDic[chord_arr[0]]
+ feature_chordRoot[time] = chordRootID
+ feature_chordAttr[time] = 1
+ elif len(chord_arr) == 2:
+ chordRootID = self.chordRootDic[chord_arr[0]]
+ chordAttrID = self.chordAttrDic[chord_arr[1]]
+ feature_chordRoot[time] = chordRootID
+ feature_chordAttr[time] = chordAttrID
+
+ if "major" in key:
+ feature_key = torch.tensor([0])
+ else:
+ feature_key = torch.tensor([1])
+
+ feature_chord = torch.from_numpy(feature_chord)
+ feature_chord = feature_chord.to(torch.long)
+
+ feature_chordRoot = torch.from_numpy(feature_chordRoot)
+ feature_chordRoot = feature_chordRoot.to(torch.long)
+
+ feature_chordAttr = torch.from_numpy(feature_chordAttr)
+ feature_chordAttr = feature_chordAttr.to(torch.long)
+
+ feature_key = feature_key.float()
+
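+        # Next-chord prediction pairs: x drops the final step, tgt is shifted
+        # left by one; the last timestep read gets an END token below.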
+ x = feature_chord[:self.max_seq_chord-1]
+ tgt = feature_chord[1:self.max_seq_chord]
+
+ x_root = feature_chordRoot[:self.max_seq_chord-1]
+ tgt_root = feature_chordRoot[1:self.max_seq_chord]
+ x_attr = feature_chordAttr[:self.max_seq_chord-1]
+ tgt_attr = feature_chordAttr[1:self.max_seq_chord]
+
+ if time < self.max_seq_chord:
+ tgt[time] = CHORD_END
+ tgt_root[time] = CHORD_ROOT_END
+ tgt_attr[time] = CHORD_ATTR_END
+
+ #### ---- SCENE OFFSET ----- ####
+ feature_scene_offset = np.empty(self.max_seq_video)
+ feature_scene_offset.fill(SCENE_OFFSET_PAD)
+ with open(self.data_files_scene_offset[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ sceneID = line_arr[1]
+ feature_scene_offset[time] = int(sceneID)+1
+
+ feature_scene_offset = torch.from_numpy(feature_scene_offset)
+ feature_scene_offset = feature_scene_offset.to(torch.float32)
+
+ #### ---- MOTION ----- ####
+ feature_motion = np.empty(self.max_seq_video)
+ feature_motion.fill(MOTION_PAD)
+ with open(self.data_files_motion[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ motion = line_arr[1]
+ feature_motion[time] = float(motion)
+
+ feature_motion = torch.from_numpy(feature_motion)
+ feature_motion = feature_motion.to(torch.float32)
+
+ #### ---- NOTE_DENSITY ----- ####
+ feature_note_density = np.empty(self.max_seq_video)
+ feature_note_density.fill(NOTE_DENSITY_PAD)
+ with open(self.data_files_note_density[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ note_density = line_arr[1]
+ feature_note_density[time] = float(note_density)
+
+ feature_note_density = torch.from_numpy(feature_note_density)
+ feature_note_density = feature_note_density.to(torch.float32)
+
+ #### ---- LOUDNESS ----- ####
+ feature_loudness = np.empty(self.max_seq_video)
+ feature_loudness.fill(LOUDNESS_PAD)
+ with open(self.data_files_loudness[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+ loudness = line_arr[1]
+ feature_loudness[time] = float(loudness)
+
+ feature_loudness = torch.from_numpy(feature_loudness)
+ feature_loudness = feature_loudness.to(torch.float32)
+
+ #### ---- EMOTION ----- ####
+ if self.emo_model.startswith("6c"):
+ feature_emotion = np.empty( (self.max_seq_video, 6))
+ else:
+ feature_emotion = np.empty( (self.max_seq_video, 5))
+
+ feature_emotion.fill(EMOTION_PAD)
+ with open(self.data_files_emotion[idx], encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if line_arr[0] == "time":
+ continue
+ time = line_arr[0]
+ time = int(time)
+ if time >= self.max_seq_chord:
+ break
+
+ if len(line_arr) == 7:
+ emo1, emo2, emo3, emo4, emo5, emo6 = \
+ line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5],line_arr[6]
+ emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5), float(emo6) ]
+ elif len(line_arr) == 6:
+ emo1, emo2, emo3, emo4, emo5 = \
+ line_arr[1],line_arr[2],line_arr[3],line_arr[4],line_arr[5]
+ emoList = [ float(emo1), float(emo2), float(emo3), float(emo4), float(emo5) ]
+
+ emoList = np.array(emoList)
+ feature_emotion[time] = emoList
+
+ feature_emotion = torch.from_numpy(feature_emotion)
+ feature_emotion = feature_emotion.to(torch.float32)
+
+        # Dominant emotion index and its probability at each timestep.
+        max_prob_values, feature_emotion_argmax = torch.max(feature_emotion, dim=1)
+
+ # -- emotion to chord
+ # maj dim sus4 min7 min sus2 aug dim7 maj6 hdim7 7 min6 maj7
+        # 0. exciting : [1,0,1,0,0,0,0,0,0,0,1,0,0]
+ # 1. fearful : [0,1,0,1,0,0,0,1,0,1,0,0,0]
+ # 2. tense : [0,1,1,1,0,0,0,0,0,0,1,0,0]
+ # 3. sad : [0,0,0,1,1,1,0,0,0,0,0,0,0]
+ # 4. relaxing: [1,0,0,0,0,0,0,0,1,0,0,0,1]
+ # 5. neutral : [0,0,0,0,0,0,0,0,0,0,0,0,0]
+
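+        # Each mask below has length 159, matching vevo_meta/chord.json:
+        # index 0 is "N", then 13 qualities x 12 roots, then END and PAD.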
+ a0 = [0]+[1,0,1,0,0,0,0,0,0,0,1,0,0]*12+[0,0]
+ a1 = [0]+[0,1,0,1,0,0,0,1,0,1,0,0,0]*12+[0,0]
+ a2 = [0]+[0,1,1,1,0,0,0,0,0,0,1,0,0]*12+[0,0]
+ a3 = [0]+[0,0,0,1,1,1,0,0,0,0,0,0,0]*12+[0,0]
+ a4 = [0]+[1,0,0,0,0,0,0,0,1,0,0,0,1]*12+[0,0]
+ a5 = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[0,0]
+
+ aend = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[1,0]
+ apad = [0]+[0,0,0,0,0,0,0,0,0,0,0,0,0]*12+[0,1]
+
+ a0_tensor = torch.tensor(a0)
+ a1_tensor = torch.tensor(a1)
+ a2_tensor = torch.tensor(a2)
+ a3_tensor = torch.tensor(a3)
+ a4_tensor = torch.tensor(a4)
+ a5_tensor = torch.tensor(a5)
+
+ aend_tensor = torch.tensor(aend)
+ apad_tensor = torch.tensor(apad)
+
+ mapped_tensor = torch.zeros((300, 159))
+ for i, val in enumerate(feature_emotion_argmax):
+ if feature_chord[i] == CHORD_PAD:
+ mapped_tensor[i] = apad_tensor
+ elif feature_chord[i] == CHORD_END:
+ mapped_tensor[i] = aend_tensor
+ elif val == 0:
+ mapped_tensor[i] = a0_tensor
+ elif val == 1:
+ mapped_tensor[i] = a1_tensor
+ elif val == 2:
+ mapped_tensor[i] = a2_tensor
+ elif val == 3:
+ mapped_tensor[i] = a3_tensor
+ elif val == 4:
+ mapped_tensor[i] = a4_tensor
+ elif val == 5:
+ mapped_tensor[i] = a5_tensor
+
+ # feature emotion : [1, 300, 6]
+ # y : [299, 159]
+ # tgt : [299]
+ # tgt_emo : [299, 159]
+ # tgt_emo_prob : [299]
+
+ tgt_emotion = mapped_tensor[1:]
+ tgt_emotion_prob = max_prob_values[1:]
+
+ feature_semantic_list = []
+ if self.is_video:
+ for i in range( len(self.vis_models_arr) ):
+ video_feature = np.load(self.data_files_semantic_list[i][idx])
+                dim_vf = video_feature.shape[1]  # e.g. 768 for CLIP ViT-L/14
+ video_feature_tensor = torch.from_numpy( video_feature )
+
+ feature_semantic = torch.full((self.max_seq_video, dim_vf,), SEMANTIC_PAD , dtype=torch.float32, device=cpu_device())
+ if(video_feature_tensor.shape[0] < self.max_seq_video):
+ feature_semantic[:video_feature_tensor.shape[0]] = video_feature_tensor
+ else:
+ feature_semantic = video_feature_tensor[:self.max_seq_video]
+ feature_semantic_list.append(feature_semantic)
+
+ return { "x":x,
+ "tgt":tgt,
+ "x_root":x_root,
+ "tgt_root":tgt_root,
+ "x_attr":x_attr,
+ "tgt_attr":tgt_attr,
+ "semanticList": feature_semantic_list,
+ "key": feature_key,
+ "scene_offset": feature_scene_offset,
+ "motion": feature_motion,
+ "emotion": feature_emotion,
+ "tgt_emotion" : tgt_emotion,
+ "tgt_emotion_prob" : tgt_emotion_prob,
+ "note_density" : feature_note_density,
+ "loudness" : feature_loudness
+ }
+
+def create_vevo_datasets(dataset_root = "./dataset", max_seq_chord=300, max_seq_video=300, vis_models="2d/clip_l14p", emo_model="6c_l14p", split_ver="v1", random_seq=True, is_video=True):
+
+ train_dataset = VevoDataset(
+ dataset_root = dataset_root, split="train", split_ver=split_ver,
+ vis_models=vis_models, emo_model =emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+ random_seq=random_seq, is_video = is_video )
+
+ val_dataset = VevoDataset(
+ dataset_root = dataset_root, split="val", split_ver=split_ver,
+ vis_models=vis_models, emo_model =emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+ random_seq=random_seq, is_video = is_video )
+
+ test_dataset = VevoDataset(
+ dataset_root = dataset_root, split="test", split_ver=split_ver,
+ vis_models=vis_models, emo_model =emo_model, max_seq_chord=max_seq_chord, max_seq_video=max_seq_video,
+ random_seq=random_seq, is_video = is_video )
+
+ return train_dataset, val_dataset, test_dataset
+
+def compute_vevo_accuracy(out, tgt):
+ softmax = nn.Softmax(dim=-1)
+ out = torch.argmax(softmax(out), dim=-1)
+
+ out = out.flatten()
+ tgt = tgt.flatten()
+
+ mask = (tgt != CHORD_PAD)
+
+ out = out[mask]
+ tgt = tgt[mask]
+
+ if(len(tgt) == 0):
+ return 1.0
+
+ num_right = (out == tgt)
+ num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+ acc = num_right / len(tgt)
+
+ return acc
+
+def compute_hits_k(out, tgt, k):
+ softmax = nn.Softmax(dim=-1)
+ out = softmax(out)
+ _, topk_indices = torch.topk(out, k, dim=-1) # Get the indices of top-k values
+
+ tgt = tgt.flatten()
+
+ topk_indices = torch.squeeze(topk_indices, dim = 0)
+
+ num_right = 0
+ pt = 0
+ for i, tlist in enumerate(topk_indices):
+ if tgt[i] == CHORD_PAD:
+ num_right += 0
+ else:
+ pt += 1
+ if tgt[i].item() in tlist:
+ num_right += 1
+
+ # Empty
+ if len(tgt) == 0:
+ return 1.0
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ hitk = num_right / pt
+
+ return hitk
+
+def compute_hits_k_root_attr(out_root, out_attr, tgt, k):
+ softmax = nn.Softmax(dim=-1)
+ out_root = softmax(out_root)
+ out_attr = softmax(out_attr)
+
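+    # Combine the factorized heads into a joint distribution over the
+    # 159-chord vocabulary: index 0 is "N", indices 1..156 decompose as
+    # root (i-1)//13 + 1 and attr (i-1)%13 + 1, then END (157) and PAD (158).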
+ tensor_shape = torch.Size([1, 299, 159])
+ out = torch.zeros(tensor_shape)
+ for i in range(out.shape[-1]):
+ if i == 0 :
+ out[0, :, i] = out_root[0, :, 0] * out_attr[0, :, 0]
+ elif i == 157:
+ out[0, :, i] = out_root[0, :, 13] * out_attr[0, :, 14]
+ elif i == 158:
+ out[0, :, i] = out_root[0, :, 14] * out_attr[0, :, 15]
+ else:
+ rootindex = int( (i-1)/13 ) + 1
+ attrindex = (i-1)%13 + 1
+ out[0, :, i] = out_root[0, :, rootindex] * out_attr[0, :, attrindex]
+
+ out = softmax(out)
+ _, topk_indices = torch.topk(out, k, dim=-1) # Get the indices of top-k values
+
+ tgt = tgt.flatten()
+
+ topk_indices = torch.squeeze(topk_indices, dim = 0)
+
+ num_right = 0
+ pt = 0
+ for i, tlist in enumerate(topk_indices):
+ if tgt[i] == CHORD_PAD:
+ num_right += 0
+ else:
+ pt += 1
+ if tgt[i].item() in tlist:
+ num_right += 1
+
+ if len(tgt) == 0:
+ return 1.0
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ hitk = num_right / pt
+
+ return hitk
+
+def compute_vevo_correspondence(out, tgt, tgt_emotion, tgt_emotion_prob, emotion_threshold):
+
+ tgt_emotion = tgt_emotion.squeeze()
+ tgt_emotion_prob = tgt_emotion_prob.squeeze()
+
+ dataset_root = "./dataset/"
+ chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+ chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+ chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+ chordInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_inv.json")
+
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrDicPath) as json_file:
+ chordAttrDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+ with open(chordDicPath) as json_file:
+ chordDic = json.load(json_file)
+ with open(chordInvDicPath) as json_file:
+ chordInvDic = json.load(json_file)
+
+ softmax = nn.Softmax(dim=-1)
+ out = torch.argmax(softmax(out), dim=-1)
+ out = out.flatten()
+
+ tgt = tgt.flatten()
+
+ num_right = 0
+ tgt_emotion_quality = tgt_emotion[:, 0:14]
+ pt = 0
+ for i, out_element in enumerate( out ):
+
+ all_zeros = torch.all(tgt_emotion_quality[i] == 0)
+ if tgt_emotion[i][-1] == 1 or all_zeros or tgt_emotion_prob[i] < emotion_threshold:
+ num_right += 0
+ else:
+ pt += 1
+ if out_element.item() != CHORD_END and out_element.item() != CHORD_PAD:
+ gen_chord = chordInvDic[ str( out_element.item() ) ]
+
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ out_quality = 1
+ elif len(chord_arr) == 2:
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ out_quality = chordAttrID # 0:N, 1:maj ... 13:maj7
+
+ if tgt_emotion_quality[i][out_quality] == 1:
+ num_right += 1
+
+
+ if(len(tgt_emotion) == 0):
+ return 1.0
+
+ if(pt == 0):
+ return -1
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ acc = num_right / pt
+
+ return acc
+
+def compute_vevo_correspondence_root_attr(y_root, y_attr, tgt, tgt_emotion, tgt_emotion_prob, emotion_threshold):
+
+ tgt_emotion = tgt_emotion.squeeze()
+ tgt_emotion_prob = tgt_emotion_prob.squeeze()
+
+ dataset_root = "./dataset/"
+ chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+ chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+ chordAttrDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr.json")
+
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+ chordInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_inv.json")
+
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrDicPath) as json_file:
+ chordAttrDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+ with open(chordDicPath) as json_file:
+ chordDic = json.load(json_file)
+ with open(chordInvDicPath) as json_file:
+ chordInvDic = json.load(json_file)
+
+ softmax = nn.Softmax(dim=-1)
+
+ y_root = torch.argmax(softmax(y_root), dim=-1)
+ y_attr = torch.argmax(softmax(y_attr), dim=-1)
+
+ y_root = y_root.flatten()
+ y_attr = y_attr.flatten()
+
+ tgt = tgt.flatten()
+ y = np.empty( len(tgt) )
+
+ y.fill(CHORD_PAD)
+
+ for i in range(len(tgt)):
+ if y_root[i].item() == CHORD_ROOT_PAD or y_attr[i].item() == CHORD_ATTR_PAD:
+ y[i] = CHORD_PAD
+ elif y_root[i].item() == CHORD_ROOT_END or y_attr[i].item() == CHORD_ATTR_END:
+ y[i] = CHORD_END
+ else:
+ chordRoot = chordRootInvDic[str(y_root[i].item())]
+ chordAttr = chordAttrInvDic[str(y_attr[i].item())]
+ if chordRoot == "N":
+ y[i] = 0
+ else:
+ if chordAttr == "N" or chordAttr == "maj":
+ y[i] = chordDic[chordRoot]
+ else:
+ chord = chordRoot + ":" + chordAttr
+ y[i] = chordDic[chord]
+
+ y = torch.from_numpy(y)
+ y = y.to(torch.long)
+ y = y.to(get_device())
+ y = y.flatten()
+
+ num_right = 0
+ tgt_emotion_quality = tgt_emotion[:, 0:14]
+ pt = 0
+ for i, y_element in enumerate( y ):
+ all_zeros = torch.all(tgt_emotion_quality[i] == 0)
+ if tgt_emotion[i][-1] == 1 or all_zeros or tgt_emotion_prob[i] < emotion_threshold:
+ num_right += 0
+ else:
+ pt += 1
+ if y_element.item() != CHORD_END and y_element.item() != CHORD_PAD:
+ gen_chord = chordInvDic[ str( y_element.item() ) ]
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ y_quality = 1
+ elif len(chord_arr) == 2:
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ y_quality = chordAttrID # 0:N, 1:maj ... 13:maj7
+
+ if tgt_emotion_quality[i][y_quality] == 1:
+ num_right += 1
+
+ if(len(tgt_emotion) == 0):
+ return 1.0
+
+ if(pt == 0):
+ return -1
+
+ num_right = torch.tensor(num_right, dtype=torch.float32)
+ acc = num_right / pt
+ return acc
+
+def compute_vevo_accuracy_root_attr(y_root, y_attr, tgt):
+
+ dataset_root = "./dataset/"
+ chordRootInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_root_inv.json")
+ chordAttrInvDicPath = os.path.join( dataset_root, "vevo_meta/chord_attr_inv.json")
+ chordDicPath = os.path.join( dataset_root, "vevo_meta/chord.json")
+
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+ with open(chordDicPath) as json_file:
+ chordDic = json.load(json_file)
+
+ softmax = nn.Softmax(dim=-1)
+
+ y_root = torch.argmax(softmax(y_root), dim=-1)
+ y_attr = torch.argmax(softmax(y_attr), dim=-1)
+
+ y_root = y_root.flatten()
+ y_attr = y_attr.flatten()
+
+ tgt = tgt.flatten()
+
+ mask = (tgt != CHORD_PAD)
+ y = np.empty( len(tgt) )
+ y.fill(CHORD_PAD)
+
+ for i in range(len(tgt)):
+ if y_root[i].item() == CHORD_ROOT_PAD or y_attr[i].item() == CHORD_ATTR_PAD:
+ y[i] = CHORD_PAD
+ elif y_root[i].item() == CHORD_ROOT_END or y_attr[i].item() == CHORD_ATTR_END:
+ y[i] = CHORD_END
+ else:
+ chordRoot = chordRootInvDic[str(y_root[i].item())]
+ chordAttr = chordAttrInvDic[str(y_attr[i].item())]
+ if chordRoot == "N":
+ y[i] = 0
+ else:
+ if chordAttr == "N" or chordAttr == "maj":
+ y[i] = chordDic[chordRoot]
+ else:
+ chord = chordRoot + ":" + chordAttr
+ y[i] = chordDic[chord]
+
+ y = torch.from_numpy(y)
+ y = y.to(torch.long)
+ y = y.to(get_device())
+
+ y = y[mask]
+ tgt = tgt[mask]
+
+ # Empty
+ if(len(tgt) == 0):
+ return 1.0
+
+ num_right = (y == tgt)
+ num_right = torch.sum(num_right).type(TORCH_FLOAT)
+
+ acc = num_right / len(tgt)
+
+ return acc
+
diff --git a/dataset/vevo_meta/chord.json b/dataset/vevo_meta/chord.json
new file mode 100644
index 0000000000000000000000000000000000000000..cafa2d8e0f8e842773de8b1e9fe9c0313342b4ae
--- /dev/null
+++ b/dataset/vevo_meta/chord.json
@@ -0,0 +1 @@
+{"N": 0, "C": 1, "C:dim": 2, "C:sus4": 3, "C:min7": 4, "C:min": 5, "C:sus2": 6, "C:aug": 7, "C:dim7": 8, "C:maj6": 9, "C:hdim7": 10, "C:7": 11, "C:min6": 12, "C:maj7": 13, "C#": 14, "C#:dim": 15, "C#:sus4": 16, "C#:min7": 17, "C#:min": 18, "C#:sus2": 19, "C#:aug": 20, "C#:dim7": 21, "C#:maj6": 22, "C#:hdim7": 23, "C#:7": 24, "C#:min6": 25, "C#:maj7": 26, "D": 27, "D:dim": 28, "D:sus4": 29, "D:min7": 30, "D:min": 31, "D:sus2": 32, "D:aug": 33, "D:dim7": 34, "D:maj6": 35, "D:hdim7": 36, "D:7": 37, "D:min6": 38, "D:maj7": 39, "D#": 40, "D#:dim": 41, "D#:sus4": 42, "D#:min7": 43, "D#:min": 44, "D#:sus2": 45, "D#:aug": 46, "D#:dim7": 47, "D#:maj6": 48, "D#:hdim7": 49, "D#:7": 50, "D#:min6": 51, "D#:maj7": 52, "E": 53, "E:dim": 54, "E:sus4": 55, "E:min7": 56, "E:min": 57, "E:sus2": 58, "E:aug": 59, "E:dim7": 60, "E:maj6": 61, "E:hdim7": 62, "E:7": 63, "E:min6": 64, "E:maj7": 65, "F": 66, "F:dim": 67, "F:sus4": 68, "F:min7": 69, "F:min": 70, "F:sus2": 71, "F:aug": 72, "F:dim7": 73, "F:maj6": 74, "F:hdim7": 75, "F:7": 76, "F:min6": 77, "F:maj7": 78, "F#": 79, "F#:dim": 80, "F#:sus4": 81, "F#:min7": 82, "F#:min": 83, "F#:sus2": 84, "F#:aug": 85, "F#:dim7": 86, "F#:maj6": 87, "F#:hdim7": 88, "F#:7": 89, "F#:min6": 90, "F#:maj7": 91, "G": 92, "G:dim": 93, "G:sus4": 94, "G:min7": 95, "G:min": 96, "G:sus2": 97, "G:aug": 98, "G:dim7": 99, "G:maj6": 100, "G:hdim7": 101, "G:7": 102, "G:min6": 103, "G:maj7": 104, "G#": 105, "G#:dim": 106, "G#:sus4": 107, "G#:min7": 108, "G#:min": 109, "G#:sus2": 110, "G#:aug": 111, "G#:dim7": 112, "G#:maj6": 113, "G#:hdim7": 114, "G#:7": 115, "G#:min6": 116, "G#:maj7": 117, "A": 118, "A:dim": 119, "A:sus4": 120, "A:min7": 121, "A:min": 122, "A:sus2": 123, "A:aug": 124, "A:dim7": 125, "A:maj6": 126, "A:hdim7": 127, "A:7": 128, "A:min6": 129, "A:maj7": 130, "A#": 131, "A#:dim": 132, "A#:sus4": 133, "A#:min7": 134, "A#:min": 135, "A#:sus2": 136, "A#:aug": 137, "A#:dim7": 138, "A#:maj6": 139, "A#:hdim7": 140, "A#:7": 141, "A#:min6": 142, "A#:maj7": 143, "B": 144, "B:dim": 145, "B:sus4": 146, "B:min7": 147, "B:min": 148, "B:sus2": 149, "B:aug": 150, "B:dim7": 151, "B:maj6": 152, "B:hdim7": 153, "B:7": 154, "B:min6": 155, "B:maj7": 156}
\ No newline at end of file
diff --git a/dataset/vevo_meta/chord_attr.json b/dataset/vevo_meta/chord_attr.json
new file mode 100644
index 0000000000000000000000000000000000000000..f55cf806aa5f034812a38385f3c6cff21f8e617d
--- /dev/null
+++ b/dataset/vevo_meta/chord_attr.json
@@ -0,0 +1 @@
+{"N": 0, "maj": 1, "dim": 2, "sus4": 3, "min7": 4, "min": 5, "sus2": 6, "aug": 7, "dim7": 8, "maj6": 9, "hdim7": 10, "7": 11, "min6": 12, "maj7": 13}
diff --git a/dataset/vevo_meta/chord_attr_inv.json b/dataset/vevo_meta/chord_attr_inv.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f355aaa2c26e0c141f64057f17a054a608d4d32
--- /dev/null
+++ b/dataset/vevo_meta/chord_attr_inv.json
@@ -0,0 +1,16 @@
+{
+ "0": "N",
+ "1": "maj",
+ "2": "dim",
+ "3": "sus4",
+ "4": "min7",
+ "5": "min",
+ "6": "sus2",
+ "7": "aug",
+ "8": "dim7",
+ "9": "maj6",
+ "10": "hdim7",
+ "11": "7",
+ "12": "min6",
+ "13": "maj7"
+}
diff --git a/dataset/vevo_meta/chord_inv.json b/dataset/vevo_meta/chord_inv.json
new file mode 100644
index 0000000000000000000000000000000000000000..b3dc21d8972ff10457557821481a94b16cdf5936
--- /dev/null
+++ b/dataset/vevo_meta/chord_inv.json
@@ -0,0 +1 @@
+{"0": "N", "1": "C", "2": "C:dim", "3": "C:sus4", "4": "C:min7", "5": "C:min", "6": "C:sus2", "7": "C:aug", "8": "C:dim7", "9": "C:maj6", "10": "C:hdim7", "11": "C:7", "12": "C:min6", "13": "C:maj7", "14": "C#", "15": "C#:dim", "16": "C#:sus4", "17": "C#:min7", "18": "C#:min", "19": "C#:sus2", "20": "C#:aug", "21": "C#:dim7", "22": "C#:maj6", "23": "C#:hdim7", "24": "C#:7", "25": "C#:min6", "26": "C#:maj7", "27": "D", "28": "D:dim", "29": "D:sus4", "30": "D:min7", "31": "D:min", "32": "D:sus2", "33": "D:aug", "34": "D:dim7", "35": "D:maj6", "36": "D:hdim7", "37": "D:7", "38": "D:min6", "39": "D:maj7", "40": "D#", "41": "D#:dim", "42": "D#:sus4", "43": "D#:min7", "44": "D#:min", "45": "D#:sus2", "46": "D#:aug", "47": "D#:dim7", "48": "D#:maj6", "49": "D#:hdim7", "50": "D#:7", "51": "D#:min6", "52": "D#:maj7", "53": "E", "54": "E:dim", "55": "E:sus4", "56": "E:min7", "57": "E:min", "58": "E:sus2", "59": "E:aug", "60": "E:dim7", "61": "E:maj6", "62": "E:hdim7", "63": "E:7", "64": "E:min6", "65": "E:maj7", "66": "F", "67": "F:dim", "68": "F:sus4", "69": "F:min7", "70": "F:min", "71": "F:sus2", "72": "F:aug", "73": "F:dim7", "74": "F:maj6", "75": "F:hdim7", "76": "F:7", "77": "F:min6", "78": "F:maj7", "79": "F#", "80": "F#:dim", "81": "F#:sus4", "82": "F#:min7", "83": "F#:min", "84": "F#:sus2", "85": "F#:aug", "86": "F#:dim7", "87": "F#:maj6", "88": "F#:hdim7", "89": "F#:7", "90": "F#:min6", "91": "F#:maj7", "92": "G", "93": "G:dim", "94": "G:sus4", "95": "G:min7", "96": "G:min", "97": "G:sus2", "98": "G:aug", "99": "G:dim7", "100": "G:maj6", "101": "G:hdim7", "102": "G:7", "103": "G:min6", "104": "G:maj7", "105": "G#", "106": "G#:dim", "107": "G#:sus4", "108": "G#:min7", "109": "G#:min", "110": "G#:sus2", "111": "G#:aug", "112": "G#:dim7", "113": "G#:maj6", "114": "G#:hdim7", "115": "G#:7", "116": "G#:min6", "117": "G#:maj7", "118": "A", "119": "A:dim", "120": "A:sus4", "121": "A:min7", "122": "A:min", "123": "A:sus2", "124": "A:aug", "125": "A:dim7", "126": "A:maj6", "127": "A:hdim7", "128": "A:7", "129": "A:min6", "130": "A:maj7", "131": "A#", "132": "A#:dim", "133": "A#:sus4", "134": "A#:min7", "135": "A#:min", "136": "A#:sus2", "137": "A#:aug", "138": "A#:dim7", "139": "A#:maj6", "140": "A#:hdim7", "141": "A#:7", "142": "A#:min6", "143": "A#:maj7", "144": "B", "145": "B:dim", "146": "B:sus4", "147": "B:min7", "148": "B:min", "149": "B:sus2", "150": "B:aug", "151": "B:dim7", "152": "B:maj6", "153": "B:hdim7", "154": "B:7", "155": "B:min6", "156": "B:maj7"}
\ No newline at end of file
diff --git a/dataset/vevo_meta/chord_root.json b/dataset/vevo_meta/chord_root.json
new file mode 100644
index 0000000000000000000000000000000000000000..f80daf81c00ab965bbb09b5a3424bf828b3be1f7
--- /dev/null
+++ b/dataset/vevo_meta/chord_root.json
@@ -0,0 +1 @@
+{"N": 0, "C": 1, "C#": 2, "D": 3, "D#": 4, "E": 5, "F": 6, "F#": 7, "G": 8, "G#": 9, "A": 10, "A#": 11, "B": 12}
\ No newline at end of file
diff --git a/dataset/vevo_meta/chord_root_inv.json b/dataset/vevo_meta/chord_root_inv.json
new file mode 100644
index 0000000000000000000000000000000000000000..9febc158c8b1aba9899c06b4aec88b4a7e7b6543
--- /dev/null
+++ b/dataset/vevo_meta/chord_root_inv.json
@@ -0,0 +1,15 @@
+{
+ "0": "N",
+ "1": "C",
+ "2": "C#",
+ "3": "D",
+ "4": "D#",
+ "5": "E",
+ "6": "F",
+ "7": "F#",
+ "8": "G",
+ "9": "G#",
+ "10": "A",
+ "11": "A#",
+ "12": "B"
+}
\ No newline at end of file
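Taken together, chord_inv.json, chord_root.json/chord_root_inv.json, and chord_attr.json define a factored chord vocabulary: 12 roots times 13 attributes, with id 0 reserved for "N" (no chord) and full chord ids laid out root-major, i.e. chord_id = (root_id - 1) * 13 + attr_id (for example, G:7 is (8 - 1) * 13 + 11 = 102). The sketch below verifies that layout against the files; it is not repo code, and it assumes that attribute ids 1-3 (before the visible "4": "min7") continue the same pattern and that a bare root name denotes the major triad, attribute id 1.

```python
# Sanity-check sketch (not part of the repo) for the factored chord vocabulary:
#   chord_id = (root_id - 1) * 13 + attr_id, with id 0 reserved for "N".
# Assumes attribute ids 1-3 follow the pattern visible above and that a bare
# root name ("C", "F#", ...) is the major triad, attribute id 1.
import json

with open("dataset/vevo_meta/chord_inv.json") as f:
    chord_inv = json.load(f)      # "1" -> "C", "2" -> "C:dim", ..., "156" -> "B:maj7"
with open("dataset/vevo_meta/chord_root.json") as f:
    chord_root = json.load(f)     # "C" -> 1, ..., "B" -> 12
with open("dataset/vevo_meta/chord_attr.json") as f:
    chord_attr = json.load(f)     # "4" -> "min7", ..., "13" -> "maj7"

attr_to_id = {name: int(i) for i, name in chord_attr.items()}

for cid, name in chord_inv.items():
    if name == "N":
        continue
    root, _, attr = name.partition(":")
    attr_id = attr_to_id[attr] if attr else 1   # bare root = major triad
    assert int(cid) == (chord_root[root] - 1) * 13 + attr_id, name
print("layout verified for", len(chord_inv) - 1, "chords")
```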
diff --git a/dataset/vevo_meta/exclude.txt b/dataset/vevo_meta/exclude.txt
new file mode 100644
index 0000000000000000000000000000000000000000..13ef0a79b8787de65a0c008583ab7dd8a4a47fd0
--- /dev/null
+++ b/dataset/vevo_meta/exclude.txt
@@ -0,0 +1 @@
+453
\ No newline at end of file
diff --git a/dataset/vevo_meta/idlist.txt b/dataset/vevo_meta/idlist.txt
new file mode 100644
index 0000000000000000000000000000000000000000..620ed37e15a6984c0284408eaa45da4b35b8ae38
--- /dev/null
+++ b/dataset/vevo_meta/idlist.txt
@@ -0,0 +1,748 @@
+001-Luis Fonsi - Despacito ft. Daddy Yankee kJQP7kiw5Fk
+002-Mark Ronson - Uptown Funk (Official Video) ft. Bruno Mars OPf0YbXqDm0
+003-Maroon 5 - Sugar (Official Music Video) 09R8_2nJtjg
+004-Justin Bieber - Sorry (PURPOSE - The Movement) fRh_vgS2dFE
+005-Katy Perry - Roar (Official) CevxZvSJLk8
+006-OneRepublic - Counting Stars (Official Music Video) hT_nvWreIhg
+007-Katy Perry - Dark Horse (Official) ft. Juicy J 0KSOMA3QBU0
+008-Crazy Frog - Axel F (Official Video) k85mRPqvMbE
+009-Enrique Iglesias - Bailando ft. Descemer Bueno, Gente De Zona (Español) NUsoVlDFqZg
+010-Taylor Swift - Shake It Off nfWlot6h_JM
+011-J Balvin, Willy William - Mi Gente (Official Video) wnJ6LuUFpMo
+012-Shakira - Waka Waka (This Time for Africa) (The Official 2010 FIFA World Cup™ Song) pRpeEdMmmQ0
+013-Adele - Hello YQHsXMglC9A
+014-Taylor Swift - Blank Space e-ORhEE9VVg
+016-Shakira - Chantaje (Official Video) ft. Maluma 6Mgqbai3fKo
+017-Justin Bieber - Baby (Official Music Video) ft. Ludacris kffacxfA7G4
+018-Calvin Harris - This Is What You Came For (Official Video) ft. Rihanna kOkQ4T5WO9E
+019-Fifth Harmony - Work from Home (Official Video) ft. Ty Dolla $ign 5GL9JoH4Sws
+020-Meghan Trainor - All About That Bass 7PCkvCPvDXk
+021-Sia - Chandelier (Official Video) 2vjPBrBU-TM
+022-Eminem - Love The Way You Lie ft. Rihanna uelHwf8o7_U
+023-Ellie Goulding - Love Me Like You Do (Official Video) AJtDXIazrMo
+024-Shawn Mendes - Treat You Better lY2yjAdbvdQ
+025-Justin Bieber - What Do You Mean (Official Music Video) DK_0jXPuIr0
+026-MAGIC! - Rude (Official Video) PIh2xe4jnpk
+027-Luis Fonsi, Demi Lovato - Échame La Culpa (Video Oficial) TyHvyGVs42U
+028-Avicii - Wake Me Up (Official Video) IcrbM1l_BoI
+029-LMFAO ft. Lauren Bennett, GoonRock - Party Rock Anthem (Official Video) KQ6zr6kCPj8
+030-Imagine Dragons - Believer 7wtfhZwyrcc
+031-Becky G, Bad Bunny - Mayores (Official Video) GMFewiplIbw
+032-John Legend - All of Me (Official Video) 450p7goxZqg
+033-Fifth Harmony - Worth It (Official Video) ft. Kid Ink YBHQbu5rbdQ
+035-The Weeknd - Starboy ft. Daft Punk (Official Video) 34Na4j8AVgA
+036-Ariana Grande ft. Nicki Minaj - Side To Side (Official Video) ft. Nicki Minaj SXiSVQZLje8
+037-Adele - Rolling in the Deep (Official Music Video) rYEDA3JcQqw
+038-Rihanna - Diamonds lWA2pjMjpBs
+039-Jennifer Lopez - On The Floor ft. Pitbull t4H_Zoh7G5A
+041-Silentó - Watch Me (Whip_Nae Nae) (Official) vjW8wmF5VWc
+042-Romeo Santos - Propuesta Indecente (Official Video) QFs3PIZb3js
+043-J. Balvin - Ay Vamos (Official Video) TapXs54Ah3E
+044-Adele - Someone Like You (Official Music Video) hLQl3WQQoQ0
+045-Drake - Hotline Bling uxpDa-c-4Mc
+046-Guns N' Roses - November Rain 8SbUC-UaAxE
+047-ZAYN - Dusk Till Dawn (Official Video) ft. Sia tt2k8PGm-TI
+048-The Chainsmokers - Don't Let Me Down (Official Video) ft. Daya Io0fBr1XBUA
+049-The Weeknd - The Hills (Official Video) yzTuBuRdAyA
+050-Imagine Dragons - Thunder fKopy74weus
+051-Jessie J, Ariana Grande, Nicki Minaj - Bang Bang (Official Video) 0HDdjwpPM3Y
+052-Ricky Martin - Vente Pa' Ca (Official Video) ft. Maluma iOe6dI2JhgU
+054-CNCO - Reggaetón Lento (Bailemos) 7jpqqBX-Myw
+055-Chino y Nacho - Andas En Mi Cabeza ft. Daddy Yankee (Video Oficial) AMTAQ-AJS4Y
+056-Justin Bieber - Love Yourself (Official Music Video) oyEuk8j8imI
+057-DJ Khaled - I'm The One ft. Justin Bieber, Quavo, Chance the Rapper, Lil Wayne weeI1G46q0o
+058-Eminem - Not Afraid (Official Video) j5-yKhDd64s
+059-Calvin Harris - Summer (Official Video) ebXbLfLACGM
+060-CAN'T STOP THE FEELING! (from DreamWorks Animation's 'TROLLS') (Official Video) ru0K8uYEZWw
+061-Lady Gaga - Bad Romance (Official Music Video) qrO4YZeyl0I
+062-Carlos Vives, Sebastián Yatra - Robarte un Beso (Official Video) Mtau4v6foHA
+063-Ellie Goulding - Burn (Official Video) CGyEd0aKWZE
+064-Calvin Harris & Disciples - How Deep Is Your Love EgqUJOudrcM
+065-Carlos Vives, Shakira - La Bicicleta -UV0QGLmYys
+066-Taylor Swift - Bad Blood ft. Kendrick Lamar QcIy9NiNbmo
+067-Mike Posner - I Took A Pill In Ibiza (Seeb Remix) (Explicit) foE1mO2yM04
+068-Sam Smith - I'm Not The Only One (Official Video) nCkpzqqog4k
+069-Rag'n'Bone Man - Human (Official Video) L3wKzyIN1yk
+070-Carly Rae Jepsen - Call Me Maybe fWNaR-rxAic
+071-Shawn Mendes - Stitches (Official Video) VbfpW0pbvaU
+072-Wisin - Escápate Conmigo (Official Video) ft. Ozuna 3X9wEwulYhk
+073-Post Malone - Congratulations ft. Quavo SC4xMk98Pdc
+074-Nirvana - Smells Like Teen Spirit (Official Music Video) hTWKbfoikeg
+075-Gente de Zona - La Gozadera (Official Video) ft. Marc Anthony VMp55KH_3wo
+076-Katy Perry - Last Friday Night (T.G.I.F.) (Official Music Video) KlyXNRrsk4A
+077-P!nk - Just Give Me A Reason ft. Nate Ruess OpQFFLBMEPI
+078-Katy Perry - Firework (Official Music Video) QGJuMBdaqIw
+079-Imagine Dragons - Radioactive ktvTqknDobU
+080-Pitbull - Timber (Official Video) ft. Ke$ha hHUbLv4ThOo
+081-French Montana - Unforgettable ft. Swae Lee CTFtOOh47oo
+082-50 Cent - In Da Club (Official Music Video) 5qm8PH4xAss
+083-Guns N' Roses - Sweet Child O' Mine (Official Music Video) 1w7OgIMMRc4
+084-One Direction - What Makes You Beautiful (Official Video) QJO3ROT-A4E
+085-Ariana Grande ft. Iggy Azalea - Problem (Official Video) iS1g8G_njx8
+086-Sam Smith - Too Good At Goodbyes (Official Video) J_ub7Etch2U
+087-AronChupa - I'm an Albatraoz _ OFFICIAL VIDEO Bznxx12Ptl0
+088-Taylor Swift - Look What You Made Me Do 3tmd-ClpJxA
+089-Chris Jedi - Ahora Dice (Official Video) ft. J. Balvin, Ozuna, Arcángel c73Cu3TQnlg
+090-Joey Montana - Picky RqpKDkVzlqU
+091-Eminem - Without Me (Official Music Video) YVkUvmDQ3HY
+092-Prince Royce - Darte un Beso bdOXnTbyk0g
+093-Taylor Swift - You Belong With Me VuNIsY6JdUw
+094-Eminem - Rap God (Explicit) XbGs_qK2PQA
+095-Don Omar - Danza Kuduro ft. Lucenzo 7zp1TbLFPp8
+096-Maluma - El Perdedor (Official Video) PJniSb91tvo
+097-Rihanna - Work (Explicit) ft. Drake HL1UzIK-flA
+098-Ricky Martin - La Mordidita (Official Video) ft. Yotuel lBztnahrOFw
+099-Beyoncé - Halo bnVUHWCynig
+100-The Weeknd - Can't Feel My Face (Official Video) KEI4qSrkPAs
+101-Shakira - La La La (Brazil 2014) ft. Carlinhos Brown 7-7knsP2n5w
+102-Sia - Elastic Heart feat. Shia LaBeouf & Maddie Ziegler (Official Video) KWZGAExj-es
+103-Katy Perry - Bon Appétit (Official) ft. Migos dPI-mRFEIH0
+104-The Cranberries - Zombie (Official Music Video) 6Ejga4kJUts
+105-Shakira - Can't Remember to Forget You (Official Video) ft. Rihanna o3mP3mJDL2k
+106-Daddy Yankee - Limbo (Video Oficial) 6BTjG-dhf5s
+107-Whitney Houston - I Will Always Love You (Official 4K Video) 3JWTaaS7LdU
+108-Miley Cyrus - Wrecking Ball (Official Video) My2FRPA3Gf8
+109-Chris Brown - Loyal (Official Video) ft. Lil Wayne, Tyga JXRN_LkCa_o
+110-Pitbull - Rain Over Me ft. Marc Anthony SmM0653YvXU
+111-Enrique Iglesias - El Perdedor (Pop) ft. Marco Antonio Solís tLcfAnN2QgY
+112-J Balvin - 6 AM ft. Farruko (Official Video) yUV9JwiQLog
+113-System Of A Down - Chop Suey! (Official HD Video) CSvFpBOe8eY
+114-Naughty Boy - La la la ft. Sam Smith (Official Video) 3O1_3zBUKM8
+115-Rick Astley - Never Gonna Give You Up (Official Music Video) dQw4w9WgXcQ
+116-Ariana Grande - Break Free ft. Zedd L8eRzOYhLuw
+117-Sam Smith - Stay With Me (Official Video) pB-5XG-DbAA
+118-Michael Jackson - Billie Jean (Official Video) Zi_XLOBDo_Y
+119-Nelly - Dilemma (Official Music Video) ft. Kelly Rowland 8WYHDfJDPDc
+120-ZAYN - PILLOWTALK (Official Music Video) C_3d6GntKbk
+121-DJ Snake, Lil Jon - Turn Down for What HMUDVMiITOU
+122-Katy Perry - Hot N Cold (Official) kTHNpusq654
+123-Iggy Azalea - Fancy ft. Charli XCX (Official Music Video) O-zpOMYRi0w
+124-Bon Jovi - It's My Life (Official Music Video) vx2u5uUu3DE
+125-Chino & Nacho - Me Voy Enamorando ft. Farruko (Remix) (Official Music Video) 0yr75-gxVtM
+126-Marc Anthony - Vivir Mi Vida (Official Video) YXnjy5YlDwk
+127-Justin Bieber - Never Say Never (Official Music Video) ft. Jaden Smith _Z5-P9v3F8w
+128-Shawn Mendes - There's Nothing Holdin' Me Back dT2owtxkU8k
+129-Enrique Iglesias - DUELE EL CORAZON ft. Wisin xFutjZEBTXs
+130-DJ Khaled - Wild Thoughts (Official Video) ft. Rihanna, Bryson Tiller fyaI4-5849w
+131-Maluma - Sin Contrato (Official Video) 9xByMBYDRmY
+132-Nicki Minaj - Anaconda LDZX4ooRsWs
+133-Maluma - Borro Cassette (Official Video) Xk0wdDTTPA0
+134-AC_DC - Thunderstruck (Official Video) v2AC41dglnM
+135-Romeo Santos - Eres Mía 8iPcqtHoR3U
+136-Backstreet Boys - I Want It That Way (Official HD Video) 4fndeDfaWCg
+137-Shakira - Hips Don't Lie (Official 4K Video) ft. Wyclef Jean DUT5rEU6pqM
+138-Camila Cabello - Havana ft. Young Thug BQ0mxQXmLsk
+139-Rihanna - We Found Love ft. Calvin Harris tg00YEETFzg
+140-J Balvin - Safari ft. Pharrell Williams, BIA, Sky (Official Video) JWESLtAKKlU
+141-Rihanna - Stay ft. Mikky Ekko JF8BRvqGCNs
+142-Maluma - Cuatro Babys (Official Video) ft. Trap Capos, Noriel, Bryant Myers, Juhn OXq-JP8w5H4
+143-Cyndi Lauper - Girls Just Want To Have Fun (Official Video) PIb6AZdTr-A
+144-Evanescence - Bring Me To Life (Official Music Video) 3YxaaGgTQYM
+145-Justin Bieber - Beauty And A Beat ft. Nicki Minaj (Official Music Video) Ys7-6_t7OEQ
+146-One Direction - Drag Me Down (Official Video) Jwgf3wmiA04
+147-Auli'i Cravalho - How Far I'll Go (from Moana_Official Video) cPAbx5kgCJo
+148-Aqua - Barbie Girl (Official Music Video) ZyhrYis509A
+149-Dr. Dre ft. Snoop Dogg - Still D.R.E. (Official Video) _CL6n0FJZpk
+150-Justin Timberlake - Mirrors (Official Video) uuZE_IRwLNI
+151-Katy Perry - Wide Awake (Official Video) k0BWlvnBmIE
+152-J Balvin - Si Tu Novio Te Deja Sola ft. Bad Bunny (Official Video) Km4BayZykwE
+153-Maroon 5 - One More Night (Official Music Video) fwK7ggA3-bU
+154-Imagine Dragons - Demons (Official Video) mWRsgZuwf_8
+155-Ariana Grande - Focus lf_wVfwpfp8
+156-Europe - The Final Countdown (Official Video) 9jK-NcRmVcw
+157-Lady Gaga - Poker Face (Official Music Video) bESGLojNYSo
+158-Post Malone - rockstar ft. 21 Savage UceaB4D0jpo
+159-Ayo & Teo - Rolex (Official Video) lwk5OUII9Vc
+160-Thalia - Desde Esa Noche (Premio Lo Nuestro 2016) ft. Maluma 6C_s56iscpQ
+161-Cali Y El Dandee - Por Fin Te Encontré ft. Juan Magan, Sebastian Yatra (Video Oficiel) _kxz7WX4mLU
+162-One Direction - Story of My Life W-TE_Ys4iwM
+163-Miley Cyrus - We Can't Stop (Official Video) LrUvu1mlWco
+164-Mike WiLL Made-It - 23 ft. Miley Cyrus, Wiz Khalifa, Juicy J (Official Music Video) bbEoRnaOIbs
+165-Scorpions - Wind Of Change (Official Music Video) n4RjJKxsamQ
+166-Nicki Minaj - Super Bass 4JipHEz53sU
+167-Karol G, Bad Bunny - Ahora Me Llama (Official Video) 4NNRy_Wz16k
+168-Tove Lo - Habits (Stay High) - Hippie Sabotage Remix SYM-RJwSGQ8
+169-Harry Styles - Sign of the Times (Official Video) qN4ooNx77u0
+170-The Police - Every Breath You Take (Official Video) OMOGaugKpzs
+171-Avicii - Waiting For Love cHHLHGNpCSA
+172-Ariana Grande - Into You (Official Video) 1ekZEVeXwek
+173-will.i.am - Scream & Shout ft. Britney Spears (Official Music Video) kYtGl1dX5qI
+174-Rihanna - What's My Name (Official Music Video) ft. Drake U0CGsw6h60k
+175-Katy Perry - Part Of Me (Official) uuwfgXD8qV8
+176-Pitbull - Give Me Everything ft. Ne-Yo, Afrojack, Nayer EPo5wWmKEaI
+177-Audioslave - Like a Stone (Official Video) 7QU1nvuxaMA
+178-HA-ASH - Perdón, Perdón (Primera Fila - Hecho Realidad [En Vivo]) _wL3Pc-EmjA
+179-Katy Perry - The One That Got Away (Official Music Video) Ahha3Cqe_fk
+180-Nacho, Yandel, Bad Bunny - Báilame (Remix) T7VewKI44rQ
+181-Sean Kingston - Beautiful Girls MrTz5xjmso4
+182-LMFAO - Sexy and I Know It (Official Video) wyx6JDQCslE
+183-Eminem - When I'm Gone (Official Music Video) 1wYNFfgrXTI
+184-Michael Jackson - They Don’t Care About Us (Brazil Version) (Official Video) QNJL6nfu__Q
+185-Lorde - Royals (US Version) nlcIKh6sBtc
+186-R. City - Locked Away ft. Adam Levine 6GUm5g8SG4o
+187-John Newman - Love Me Again CfihYWRWRTQ
+188-No Doubt - Don't Speak (Official 4K Music Video) TR3Vdo5etCQ
+189-Bon Jovi - Livin' On A Prayer (Official Music Video) lDK9QqIzhwk
+190-Stromae - Papaoutai oiKj0Z_Xnjc
+191-Rae Sremmurd - Black Beatles ft. Gucci Mane (Official Video) b8m9zhNAgKs
+192-Little Mix - Black Magic (Official Video) MkElfR_NPBI
+193-Shakira - Perro Fiel (Official Video) ft. Nicky Jam SHq2qrFUlGY
+194-Wisin - Adrenalina (Official Video) ft. Jennifer Lopez, Ricky Martin ME2Hufquz0k
+195-Beyoncé - Single Ladies (Put a Ring on It) (Video Version) 4m1EFMoRFvY
+196-Bonnie Tyler - Total Eclipse of the Heart (Video) lcOxhH8N3Bo
+197-Rihanna - Only Girl (In The World) (Official Music Video) pa14VNsdSYM
+198-Miley Cyrus - Party In The U.S.A. (Official Video) M11SvDtPBhA
+199-Rae Sremmurd - No Type (Official Video) wzMrK-aGCug
+200-J. Balvin - Ginza (Official Video) zZjSX01P5dE
+201-Justin Bieber - Boyfriend (Official Music Video) 4GuqB1BQVr4
+202-Akon - Smack That (Official Music Video) ft. Eminem bKDdT_nyP54
+203-Rihanna - Man Down sEhy-RXkNo0
+204-Indila - Dernière Danse (Clip Officiel) K5KAc5CoCuk
+205-Hoobastank - The Reason (Official Music Video) fV4DiAyExN0
+206-Kendrick Lamar - HUMBLE. tvTRZJ-4EyI
+207-Foster The People - Pumped Up Kicks (Official Video) SDTZ7iX4vTQ
+208-Khalid - Young Dumb & Broke (Official Video) IPfJnp1guPc
+209-Michael Jackson - Thriller (Official Video) sOnqjkJTMaA
+210-Pitbull - International Love (Official Video) ft. Chris Brown CdXesX6mYUE
+211-Calvin Harris - I Need Your Love (Official Video) ft. Ellie Goulding AtKZKl7Bgu0
+212-Eminem ft. Rihanna - The Monster (Explicit) [Official Video] EHkozMIXZ8w
+213-Evanescence - My Immortal (Official Music Video) 5anLPw0Efmo
+214-Swedish House Mafia ft. John Martin - Don't You Worry Child (Official Video) 1y6smkh6c-0
+215-George Michael - Careless Whisper (Official Video) izGwDsrQ1eQ
+216-Jennifer Lopez - Ain't Your Mama (Official Video) Pgmx7z49OEk
+217-Shakira - Me Enamoré (Official Video) sPTn0QEhxds
+218-We Are One (Ole Ola) [The Official 2014 FIFA World Cup Song] (Olodum Mix) TGtWWb9emYI
+219-AC_DC - Back In Black (Official Video) pAgnJDJN4VA
+220-Avicii - The Nights UtF6Jej8yb4
+221-La Adictiva Banda San José de Mesillas - Después de Ti, ¿Quién (Video Oficial) YWu9mB6X9Oc
+222-Kygo - Firestone ft. Conrad Sewell (Official Video) 9Sc-ir2UwGU
+223-Taylor Swift - Wildest Dreams IdneKLhsWOQ
+224-Bon Jovi - Always (Official Music Video) 9BMwcO6_hyA
+225-Maroon 5 - Animals (Official Music Video) qpgTC9MDx1o
+226-Farruko - Chillax ft. Ky-Mani Marley (Official Video) 7fEQmJ98x_Y
+227-Michael Jackson - Beat It (Official Video) oRdxUFDoQe0
+228-Bobby Shmurda - Hot N_gga (Official Music Video) vJwKKKd2ZYE
+229-Adele - Send My Love (To Your New Lover) fk4BbF7B29w
+230-Robin Thicke - Blurred Lines ft. T.I., Pharrell (Official Music Video) yyDUC1LUXSU
+231-Calvin Harris - Blame ft. John Newman 6ACl8s_tBzE
+232-Jessie J - Price Tag ft. B.o.B qMxX-QOV9tI
+233-Katy Perry - This Is How We Do (Official) 7RMQksXpQSk
+234-Don Omar - Taboo lRWqYR3e7xE
+235-Romeo Santos - Yo También (Official Video) ft. Marc Anthony QBaIMZ8QjcU
+236-Alvaro Soler - Sofia (Official Music Video) qaZ0oAh4evU
+237-Rihanna - Umbrella (Orange Version) (Official Music Video) ft. JAY-Z CvBfHwUxHIk
+238-Farruko, Bad Bunny, Rvssian - Krippy Kush (Official Video) j1_JW7An2l0
+239-Selena Gomez - The Heart Wants What It Wants (Official Video) ij_0p_6qTss
+240-Enrique Iglesias, Juan Luis Guerra - Cuando Me Enamoro (Official Music Video) 4DO8GsIYfhQ
+241-Zara Larsson - Lush Life tD4HCZe-tew
+242-The Verve - Bitter Sweet Symphony (Official Music Video) 1lyu1KKwC74
+243-The Black Eyed Peas - Where Is The Love (Official Music Video) WpYeekQkAdc
+244-One Direction - Best Song Ever o_v9MY_FMcw
+245-Maroon 5 - Moves Like Jagger ft. Christina Aguilera (Official Music Video) iEPTlhBmwRg
+246-Sia - The Greatest GKSRyLdjsPA
+247-Akon - Lonely (Official Music Video) 6EEW-9NDM5k
+248-Ariana Grande, The Weeknd - Love Me Harder (Official Video) g5qU7p7yOY8
+249-50 Cent - Candy Shop (Official Music Video) ft. Olivia SRcnnId15BA
+250-Selena Gomez - Come & Get It n-D1EB74Ckg
+251-Meghan Trainor - Like I'm Gonna Lose You (Official Video) ft. John Legend 2-MBfn8XjIU
+252-Jonas Blue - Mama ft. William Singe (Official Video) qPTfXwPf_HM
+253-One Direction - One Thing Y1xs_xPb46M
+254-Mariah Carey - All I Want For Christmas Is You (Official Video) yXQViqx6GMY
+255-Jonas Blue - Perfect Strangers ft. JP Cooper (Official Video) Ey_hgKCCYU4
+256-Maroon 5 - Payphone ft. Wiz Khalifa (Explicit) (Official Music Video) KRaWnd3LJfs
+257-Simone & Simaria - Loka (Ao Vivo) ft. Anitta UrT0zCmsN8c
+258-Future - Low Life (Official Music Video) ft. The Weeknd K_9tX4eHztY
+259-Silvestre Dangond, Nicky Jam - Cásate Conmigo (Official Video) cpN78ZjnCZY
+261-One Direction - Live While We're Young AbPED9bisSc
+262-Lil Wayne - Mirror ft. Bruno Mars (Official Music Video) OZLUa8JUR18
+263-Katy Perry - Chained To The Rhythm (Official) ft. Skip Marley Um7pMggPnug
+264-Justin Bieber - One Time (Official Music Video) CHVhwcOg6y8
+265-Dillon Francis, DJ Snake - Get Low (Official Music Video) 12CeaxLiMgE
+266-The Weeknd - Earned It (from Fifty Shades Of Grey) (Official Video - Explicit) waU75jdUnYw
+267-Taylor Swift - Style -CmadmM5cOk
+268-Adele - Set Fire To The Rain (Live at The Royal Albert Hall) Ri7-vnrJD3k
+269-Wham! - Last Christmas (Official Video) E8gmARGvPlI
+270-3 Doors Down - Here Without You (Official Music Video) kPBzTxZQG5Q
+271-Shakira - Try Everything (Official Video) c6rP-YP4c5I
+272-Guns N' Roses - Paradise City Rbm6GXllBiw
+273-MC Hammer - U Can't Touch This (Official Music Video) otCpCn0l4Wo
+274-Taylor Swift - We Are Never Ever Getting Back Together WA4iX5D9Z64
+275-The Black Eyed Peas - Pump It (Official Music Video) ZaI2IlHwmgQ
+276-Sia - Cheap Thrills (Performance Edit) 31crA53Dgu0
+277-Nelly Furtado - Say It Right (Official Music Video) 6JnGBs88sL0
+278-Britney Spears - ...Baby One More Time (Official Video) C-u5WLJ9Yk4
+279-Banda Los Recoditos - Mi Último Deseo (Video Oficial) cVlAmP-KDT4
+280-Jessie J - Flashlight (from Pitch Perfect 2) (Official Video) DzwkcbTQ7ZE
+282-Demi Lovato - Heart Attack (Official Video) AByfaYcOm4A
+283-Meghan Trainor - Me Too qDRORgoZxZU
+284-Guns N' Roses - Don't Cry zRIbf6JqkNc
+285-Bastille - Pompeii (Official Music Video) F90Cw4l-8NY
+286-Akon - Right Now (Na Na Na) (Official Video) vIaH35-MLsk
+287-Katy Perry - Swish Swish (Official) ft. Nicki Minaj iGk5fR-t5AU
+288-Shakira - La Tortura (Official HD Video) ft. Alejandro Sanz Dsp_8Lm1eSk
+289-ZAYN, Taylor Swift - I Don’t Wanna Live Forever (Fifty Shades Darker) 7F37r50VUTQ
+290-Ariana Grande - Dangerous Woman 9WbCfHutDSE
+291-Pitbull - Feel This Moment (Official Video) ft. Christina Aguilera 5jlI4uzZGjU
+292-Selena Gomez & The Scene - Love You Like A Love Song EgT_us6AsDg
+293-Wisin, Carlos Vives - Nota de Amor (Official Video) ft. Daddy Yankee wZRWpr1G1Qw
+294-Beyoncé - Drunk in Love (Explicit) ft. JAY Z p1JPKLa-Ofc
+295-Romeo Santos - La Diabla_Mi Santa ft. Tomatito Hz9lhqxl_gQ
+296-Maroon 5 - She Will Be Loved (Official Music Video) nIjVuRTm-dc
+297-The Black Eyed Peas - My Humps iEe_eraFWWs
+298-Duke Dumont - Ocean Drive (Official Music Video) KDxJlW6cxRk
+299-Iggy Azalea - Black Widow ft. Rita Ora (Official Music Video) u3u22OYqFGo
+300-Justin Bieber - Company (Official Music Video) gdx7gN1UyX0
+301-Anna Kendrick - Cups (Pitch Perfect’s “When I’m Gone”) [Official Video] cmSbXsFE3l8
+302-J Balvin - Bobo (Official Video) 0GvLP2C2w9U
+303-Alicia Keys - No One (Official Music Video) rywUS-ohqeE
+304-Adele - When We Were Young (Live at The Church Studios) DDWKuo3gXMQ
+305-Waka Waka (Esto es Africa) (Cancion Oficial de la Copa Mundial de la FIFA™ Sudafrica 2010) dzsuE5ugxf4
+306-Katy Perry - California Gurls (Official Music Video) ft. Snoop Dogg F57P9C4SAW4
+307-The Pussycat Dolls - Buttons (Official Music Video) ft. Snoop Dogg VCLxJd1d84s
+308-Taylor Swift - Love Story 8xg3vE8Ie_E
+309-Demi Lovato - Let It Go (from 'Frozen') (Official Video) kHue-HaXXzg
+311-Calvin Harris - Outside (Official Video) ft. Ellie Goulding J9NQFACZYEU
+312-Shakira - Whenever, Wherever (Official HD Video) weRHyjj34ZE
+313-Maroon 5 - What Lovers Do ft. SZA (Official Music Video) 5Wiio4KoGe8
+314-Wisin & Yandel - Follow The Leader ft. Jennifer Lopez Xmap94TcDNs
+315-Enrique Iglesias - Loco ft. Romeo Santos RSyUWjftHrs
+316-Toni Braxton - Un-Break My Heart (Official HD Video) p2Rch6WvPJE
+317-Daddy Yankee - Sígueme y Te Sigo (Video Oficial) EfF9EE6ZR5E
+318-Ke$ha - TiK ToK (Official HD Video) iP6XpLQM2Cs
+319-Katy Perry - E.T. ft. Kanye West (Official Music Video) t5Sd5c4o9UM
+320-Meghan Trainor - NO cMTAUr3Nm6I
+321-Katy Perry - Unconditionally (Official) XjwZAa2EjKA
+322-Taylor Swift - 22 AgFeZr5ptV8
+323-Roxette - It Must Have Been Love (Official Music Video) k2C5TjS2sh4
+324-Capital Cities - Safe And Sound (Official Music Video) 47dtFZ8CFo8
+325-Shakira - Loca (Spanish Version) ft. El Cata XAhTt60W7qo
+326-Wisin & Yandel - Algo Me Gusta De Ti ft. Chris Brown, T-Pain 3rgwIp6D3ow
+327-Rihanna - Rude Boy (Official Music Video) e82VE8UtW8A
+328-Beyoncé - Crazy In Love ft. JAY Z ViwtNLUqkMY
+329-Rihanna - Don't Stop The Music yd8jh9QYfEs
+330-Meghan Trainor - Dear Future Husband (Official Video) ShlW5plD_40
+331-Eminem - Mockingbird (Official Music Video) S9bCLPwzSC0
+332-A Great Big World, Christina Aguilera - Say Something -2U0Ivkn2Ds
+333-Nelly - Just A Dream (Official Music Video) N6O2ncUKvlg
+334-Meghan Trainor - Lips Are Movin (Official Music Video) qDc_5zpBj7s
+335-Avril Lavigne - Girlfriend Bg59q4puhmg
+336-Rihanna - Take A Bow (Official Music Video) J3UjJ4wKLkg
+337-The Black Eyed Peas - I Gotta Feeling (Official Music Video) uSD4vsh1zDA
+338-Rihanna - Where Have You Been HBxt_v0WF6Y
+339-Avicii - Levels _ovdm2yX4MA
+340-Eminem - No Love (Explicit Version) ft. Lil Wayne KV2ssT8lzj8
+341-Rachel Platten - Fight Song (Official Video) xo1VInw-SKc
+342-LMFAO - Sorry For Party Rocking SkTt9k4Y-a8
+343-Abba - Dancing Queen (Official Music Video Remastered) xFrGuyw1V8s
+344-The Black Eyed Peas - The Time (Dirty Bit) (Official Music Video) JwQZQygg3Lk
+345-Plan B - Mi Vecinita SB8-YY2DyHI
+346-One Direction - You & I _kqQDCxRCzM
+347-Jennifer Lopez - Dance Again (Official Video) ft. Pitbull bjgFH01k0gU
+348-Britney Spears - Toxic (Official HD Video) LOZuxwVk7TU
+349-Chris Brown - With You (Official Video) nmjdaBaZe8Y
+351-G-Eazy x Bebe Rexha - Me, Myself & I bSfpSOBD30U
+352-Chris Brown - Look at Me Now (Official Video) ft. Lil Wayne, Busta Rhymes 8gyLR4NfMiI
+353-Beyoncé - If I Were A Boy AWpsOqh8q0M
+354-Beyoncé - Run the World (Girls) (Official Video) VBmMU_iwe6U
+355-Rich Gang ft. Young Thug, Rich Homie Quan - Lifestyle (Official Video) nGt_JGHYEO4
+356-One Direction - Kiss You (Official) T4cdfRohhcg
+357-Pia Mia - Do It Again ft. Chris Brown, Tyga (Official Music Video) cNw8A5pwbVI
+358-HA-ASH - Te Dejo En Libertad (En Vivo) ZxvI1epOAWE
+359-Maître Gims - Bella (Clip officiel) rMltoD1jCGI
+360-Selena Gomez - Kill Em With Kindness HHP5MKgK0o8
+361-AC_DC - Highway to Hell (Live At River Plate, December 2009) gEPmA3USJdI
+362-Imagine Dragons - It's Time sENM2wA_FTg
+363-Lana Del Rey - Born To Die (Official Music Video) Bag1gUxuU0g
+364-Sean Kingston, Justin Bieber - Eenie Meenie (Video Version) prmmCg5bKxA
+365-Alejandro Fernández - Hoy Tengo Ganas De Ti ft. Christina Aguilera (Video Oficial) Z81hsLIY1sQ
+366-Prince Royce, Shakira - Deja vu (Official Video) XEvKn-QgAY0
+367-Camila - Mientes (Video) xftFxCYQTdk
+368-Dr. Dre ft. Snoop Dogg, Kurupt, Nate Dogg - The Next Episode (Official Video) QZXc39hT8t4
+369-Soulja Boy Tell'em - Crank That (Soulja Boy) (Official Music Video) 8UFIYGkROII
+370-The Script - Hall of Fame (Official Video) ft. will.i.am mk48xRzuNvA
+371-Zara Larsson, MNEK - Never Forget You GTyN-DB_v5M
+372-Mariah Carey - We Belong Together (Official Music Video) 0habxsuXW4g
+374-Mr. Probz - Waves (Robin Schulz Remix Radio Edit) pUjE9H8QlA4
+375-P!nk - Try (Official Video) yTCDVfMz15M
+376-Justin Bieber - I'll Show You (Official Music Video) PfGaX8G0f2E
+377-One Direction - Perfect (Official Video) Ho32Oh6b4jc
+378-Selena Gomez - Good For You 1TsVjvEkc4s
+379-Demi Lovato - Sorry Not Sorry (Official Video) -MsvER1dpjM
+380-Plan B - Fanatica Sensual Official Video QvypZSdjO8M
+381-Eminem - Stan (Long Version) ft. Dido gOMhN-hfMtY
+382-Nicki Minaj - Only ft. Drake, Lil Wayne, Chris Brown zXtsGAkyeIo
+383-Foo Fighters - The Pretender SBjQ9tuuTJQ
+384-Taylor Swift - I Knew You Were Trouble vNoKguSdy4Y
+385-Drake - Started From The Bottom (Explicit) RubBzkZzpUA
+386-Rihanna, Kanye West, Paul McCartney - FourFiveSeconds kt0g4dWxEBo
+387-Miley Cyrus - Malibu (Official Video) 8j9zMok6two
+388-Lady Antebellum - Need You Now eM213aMKTHg
+389-Beyoncé - Love On Top (Official Video) Ob7vObnFUJc
+390-Guns N' Roses - Welcome To The Jungle o1tj2zJ2Wvg
+391-Tyga - Hookah (Official Music Video) ft. Young Thug b-J95fYuVz4
+392-Justin Bieber - As Long As You Love Me ft. Big Sean (Official Music Video) R4em3LKQCAQ
+393-J Balvin - Tranquila (Official Video) HWyEEj2pSt0
+394-One Direction - Night Changes syFZfO_wfMQ
+395-Farruko - Passion Whine ft. Sean Paul (Official Video) MNmc_XJp5rI
+396-Lady Gaga - Alejandro niqrrmev4mA
+397-Justin Bieber - Somebody To Love Remix ft. Usher (Official Music Video) SOI4OF7iIr4
+398-J Balvin - Sigo Extrañándote (Official Video) nZ0zbsZOdwg
+399-Avril Lavigne - When You're Gone (Official Video) 0G3_kG5FFfQ
+400-Desiigner - Panda (Official Music Video) E5ONTXHS2mM
+402-The Calling - Wherever You Will Go (Official Video) iAP9AF6DCu4
+403-Nego do Borel - Você Partiu Meu Coração ft. Anitta, Wesley Safadão (Video Oficial) Xp-dKdSUuLk
+404-Louis Tomlinson - Back to You (Official Video) ft. Bebe Rexha, Digital Farm Animals -HjpL-Ns6_A
+405-Maroon 5 - Maps (Explicit) (Official Music Video) NmugSMBh_iI
+406-The Weeknd - Often (NSFW) (Official Video) JPIhUaONiLU
+407-Nicki Minaj - Right By My Side (Explicit) ft. Chris Brown he3DJLXbebI
+408-Cali Y El Dandee - Yo Te Esperaré _KSyWS8UgA4
+409-Lana Del Rey - Young and Beautiful (Official Music Video) o_1aF54DO60
+411-The Killers - Mr. Brightside (Official Music Video) gGdGFtwCNBE
+412-One Direction - One Way Or Another (Teenage Kicks) 36mCEZzzQ3o
+413-Lil Wayne - Lollipop ft. Static (Official Music Video) 2IH8tNQAzSs
+415-Sam Smith - Lay Me Down (Official Video) HaMq2nn5ac0
+416-Kungs vs Cookin’ on 3 Burners - This Girl (Official Music Video) 2Y6Nne8RvaA
+417-Becky G - Shower 50-_oTkmF5I
+418-Jennifer Lopez - Papi (Official Video) 6XbIuSLaCnk
+419-Selena Gomez - Same Old Love 9h30Bx4Klxg
+420-Justin Bieber - Mistletoe (Official Music Video) LUjn3RpkcKY
+421-Dr. Dre ft. Eminem, Skylar Grey - I Need A Doctor (Explicit) [Official Video] VA770wpLX-Q
+422-Akon - Don't Matter (Official Music Video) JWA5hJl4Dv0
+423-Kelly Clarkson - Because Of You (VIDEO) Ra-Om7UMSJc
+424-DNCE - Cake By The Ocean vWaRiD5ym74
+425-Fifth Harmony - All In My Head (Flex) (Official Video) ft. Fetty Wap jsbeemdD2rQ
+426-Timbaland - Apologize ft. OneRepublic ZSM3w1v-A_Y
+427-Beyoncé, Shakira - Beautiful Liar QrOe2h9RtWI
+428-Demi Lovato - Give Your Heart a Break (Official Video) 1zfzka5VwRc
+429-Ariana Grande - The Way ft. Mac Miller _sV0S8qWSy0
+430-Beyoncé - Irreplaceable 2EwViQxSJJQ
+431-Logic - 1-800-273-8255 ft. Alessia Cara, Khalid (Official Video) Kb24RrHIbFk
+432-50 Cent - 21 Questions (Official Music Video) ft. Nate Dogg cDMhlvbOFaM
+433-Enrique Iglesias - Hero (Official Music Video) koJlIGDImiU
+434-Miley Cyrus - The Climb NG2zyeVRcbs
+435-Owl City - Fireflies (Official Music Video) psuRGfAaju4
+436-Thalia - Equivocada (Live Version) QPeNUfc8hGk
+437-Avril Lavigne - Wish You Were Here (Video) VT1-sitWRtY
+439-Eminem - Beautiful (Official Music Video) lgT1AidzRWM
+440-Christina Aguilera, Lil' Kim, Mya, Pink - Lady Marmalade (Official Music Video) RQa7SvVCdZk
+441-Rihanna - Pour It Up (Explicit) ehcVomMexkY
+442-Rihanna - Unfaithful (Official Music Video) rp4UwPZfRis
+443-J. Balvin, Jowell & Randy - Bonita (Official Video) SqpvOqRieYY
+444-Selena Gomez - Hands To Myself FMlcn-_jpWY
+445-One Direction - History (Official Video) yjmp8CoZBIo
+446-Calvin Harris - My Way (Official Video) b4Bj7Zb-YD4
+447-Nicki Minaj - Starships (Explicit) SeIJmciN8mo
+448-Reik - Creo en Ti snFhcHHdzT0
+449-Kings Of Leon - Sex on Fire (Official Video) RF0HhrwIwp0
+450-Justin Bieber - Love Me (Official Music Video) qdDVtFvJwUc
+451-The Black Eyed Peas - Boom Boom Pow (Official Music Video) 4m48GqaOz90
+452-Justin Timberlake - Cry Me A River (Official Video) DksSPZTZES0
+453-Lady Gaga - Telephone ft. Beyoncé (Official Music Video) EVBsypHzF3U
+454-Eminem - Like Toy Soldiers (Official Video) lexLAjh8fPA
+455-Naughty Boy ft. Beyoncé, Arrow Benjamin - Runnin' (Lose It All) [Official Video] eJSik6ejkr0
+456-Lil Wayne - Love Me ft. Drake, Future (Explicit) (Official Music Video) KY44zvhWhp4
+457-Kelly Clarkson - Stronger (What Doesn't Kill You) [Official Video] Xn676-fLq7I
+458-Descendants Cast - Rotten to the Core (from Descendants) (Official Video) zGlLe1w3DJM
+459-P!nk - So What (Official Video) FJfFZqTlWrQ
+460-Timbaland - The Way I Are (Official Music Video) ft. Keri Hilson, D.O.E., Sebastian U5rLz5AZBIA
+461-Vanilla Ice - Ice Ice Baby (Official Music Video) rog8ou-ZepE
+462-Bryson Tiller - Don't (Explicit Version) d7cVLE4SaN0
+463-Michael Jackson - The Way You Make Me Feel (Official Video) HzZ_urpj4As
+464-Machine Gun Kelly, Camila Cabello - Bad Things (Official Music Video) QpbQ4I3Eidg
+465-Eminem - You Don't Know (Official Music Video) ft. 50 Cent, Cashis, Lloyd Banks ngH0fkiNo-g
+467-Kanye West - Stronger PsO6ZnUZI0g
+468-Bloodhound Gang - The Bad Touch (Official Video) xat1GVnl8-k
+469-What Goes Around...Comes Around (Official Video) TOrnUquxtwA
+470-Reyli Barba - Amor del Bueno (Video) FUinZg5MC5U
+471-Owl City & Carly Rae Jepsen - Good Time (Official Video) H7HmzwI67ec
+472-Plan B - Candy 9FWgcBfs5A0
+473-The Black Eyed Peas - Meet Me Halfway (Official Music Video) I7HahVwYpwo
+474-Lady Gaga - Judas (Official Music Video) wagn8Wrmzuc
+475-Justin Bieber - One Less Lonely Girl (Official Music Video) LXUSaVw3Mvk
+476-Lady Gaga - Applause (Official Music Video) pco91kroVgQ
+477-Rihanna - Rehab (Official Music Video) ft. Justin Timberlake rJYcmq__nDM
+478-Ricardo Montaner - La Gloria de Dios (Video Oficial) ft. Evaluna Montaner LRsgqFu5c1o
+479-Maître Gims - Est-ce que tu m'aimes (Clip officiel) 6TpyRE_juyA
+480-Michael Jackson - Bad (Shortened Version) dsUXAEzaC3Q
+481-Beyoncé - Best Thing I Never Had (Video) FHp2KgyQUFk
+482-Shawn Mendes, Camila Cabello - I Know What You Did Last Summer (Official Video) ngORmvyvAaI
+483-Drake - Take Care ft. Rihanna -zzP29emgpg
+484-One Direction - Steal My Girl UpsKGvPjAgw
+485-Selena Gomez - Slow Down (Official) Z8eXaXoUJRQ
+486-Jennifer Lopez - Booty ft. Iggy Azalea (Official Video) nxtIRArhVD4
+487-Demi Lovato - Cool for the Summer (Official Video) il9nqWw9W3Y
+488-Tove Lo - Habits (Stay High) oh2LWWORoiM
+489-WALK THE MOON - Shut Up and Dance (Official Video) 6JCLY0Rlx6Q
+490-One Direction - Little Things xGPeNN9S0Fg
+491-Big Sean - I Don't Fuck With You (Official Music Video) ft. E-40 cZaJYDPY-YQ
+492-Enrique Iglesias - No Me Digas Que No (Official Music Video) ft. Wisin & Yandel zyqt2avPkoA
+494-Taylor Swift - Everything Has Changed ft. Ed Sheeran w1oM3kQpXRo
+495-Britney Spears - Work B_ch (Official Music Video) pt8VYOfr8To
+496-Nacho - Bailame a1J44C-PZ3E
+497-Axel - Te Voy A Amar KZh60U1PqSE
+498-Route 94 - My Love (Official Video) ft. Jess Glynne BS46C2z5lVE
+499-Kendji Girac - Andalouse (Clip Officiel) FndmvPkI1Ms
+500-Little Mix - Touch (Official Video) gBAfejjUQoA
+501-Iggy Azalea - Work (Official Music Video) _zR6ROjoOX0
+502-Wisin & Yandel - Estoy Enamorado whBcmlaSLJM
+503-Alicia Keys - Girl on Fire (Official Video) J91ti_MpdHA
+504-Avril Lavigne - What The Hell (Official Music Video) tQmEd_UeeIk
+505-Zara Larsson - Uncover (Official Music Video) U-PXEe-qeK4
+506-Lady Gaga - Just Dance ft. Colby O'Donis (Official Music Video) ft. Colby O'Donis 2Abk1jAONjw
+507-Maluma - La Temperatura (Video) ft. Eli Palacios Tgt6iaSYMEM
+508-Akon - Sorry, Blame It On Me (Official Music Video) ynMk2EwRi4Q
+509-CNCO, Yandel - Hey DJ (Official Video) X6wQOW9ihDA
+510-Selena Gomez & The Scene - Naturally a_YR4dKArgo
+511-Eminem - Space Bound (Official Video) JByDbPn6A1o
+512-YG - My Nigga ft. Jeezy, Rich Homie Quan (Explicit) (Official Music Video) MSrTnWDTdwI
+513-August Alsina - No Love ft. Nicki Minaj nxvm4P0jFKY
+514-Farruko - Obsesionado (Official Video) lkN51aqPOzU
+515-Rihanna - Hate That I Love You ft. Ne-Yo KMOOr7GEkj8
+516-Madonna - Bitch I'm Madonna ft. Nicki Minaj 7hPMmzKs62w
+517-Selena Gomez & The Scene - Who Says BzE1mX4Px0I
+518-Ariana Grande - One Last Time (Official) BPgEgaPk62M
+519-Calvin Harris - Sweet Nothing (Official Video) ft. Florence Welch 17ozSeGw-fY
+520-Maroon 5 - Misery (Official Music Video) 6g6g2mvItp4
+521-Jay-Z & Kanye West - Ni_as In Paris (Explicit) gG_dA32oH44
+523-Beyoncé - Sorry (Video) QxsmWxxouIM
+524-The Weeknd - Reminder (Official Video) JZjAg6fK-BQ
+525-Pusho - Te Fuiste ft. Ozuna aZOGcaU7q1A
+526-Jeremih - Down On Me ft. 50 Cent (Official Music Video) AaXaig_43lU
+527-Jordin Sparks, Chris Brown - No Air (Official Video) ft. Chris Brown WBKnpyoFEBo
+528-Marc Anthony - Valio La Pena (Salsa Version) Ns9YYSqLxyI
+529-Prince Royce - Back It Up (Official Video) ft. Jennifer Lopez, Pitbull 9w9dXWU5nMI
+530-Eminem - Cleanin' Out My Closet (Official Music Video) RQ9_TKayu9s
+531-Chris Brown - Kiss Kiss ft. T-Pain eNII9PDlFJ0
+532-Avicii vs Nicky Romero - I Could Be The One (Nicktim) bek1y2uiQGA
+533-Jessie J - Domino (Official Video) UJtB55MaoD0
+534-Don Omar - Zumba Campaign Video 8HpG0l9cLos
+535-Britney Spears - Womanizer (Director's Cut) (Official HD Video) rMqayQ-U74s
+536-Demi Lovato - Confident (Official Video) cwLRQn61oUY
+537-Usher - DJ Got Us Fallin' In Love (Official Music Video) ft. Pitbull C-dvTjK_07c
+538-Beyoncé - Pretty Hurts (Video) LXXQLa-5n5w
+539-Akon - I Wanna Love You ft. Snoop Dogg GJzF7H2e3Tw
+540-Of Monsters And Men - Little Talks (Official Video) ghb6eDopW8I
+541-Enrique Iglesias - I Like It (Official Music Video) X9_n8jakvWU
+542-Michael Jackson, Justin Timberlake - Love Never Felt So Good (Official Video) oG08ukJPtR8
+543-Akon - Beautiful (Official Music Video) ft. Colby O'Donis, Kardinal Offishall rSOzN0eihsE
+544-Farruko - Sunset (Official Video) ft. Shaggy, Nicky Jam ZBMsSPR9QMg
+545-Ace Hood - Bugatti (Official Music Video) (Explicit) ft. Future, Rick Ross djE-BLrdDDc
+546-Jennifer Lopez - I'm Into You ft. Lil Wayne IgLcQmlN2Xg
+547-Calibre 50 - Contigo oeeNs3KInbc
+548-will.i.am - Feelin' Myself ft. Miley Cyrus, Wiz Khalifa, French Montana VRuoR--LdqQ
+549-Diddy - Dirty Money - Coming Home ft. Skylar Grey (Official Video) k-ImCpNqbJw
+550-Bon Jovi - You Give Love A Bad Name (Official Music Video) KrZHPOeOxQQ
+551-Chamillionaire - Ridin' (Official Music Video) ft. Krayzie Bone CtwJvgPJ9xw
+552-Zedd - Clarity ft. Foxes (Official Music Video) IxxstCcJlsc
+553-Justin Bieber - Confident ft. Chance The Rapper (Official Music Video) 47YClVMlthI
+554-Lana Del Rey - Blue Jeans JRWox-i6aAk
+555-blink-182 - I Miss You (Official Video) s1tAYmMjLdY
+556-Fergie - M.I.L.F. $ (Official Music Video) bsUWK-fixiA
+557-Taylor Swift - Mine XPBwXKgDTdE
+558-Three Days Grace - I Hate Everything About You (Official Video) d8ekz_CSBVg
+559-T.I. - About The Money ft. Young Thug (Official Music Video) etfIdtm-OC8
+560-will.i.am - This Is Love ft. Eva Simons (Official Music Video) 9I9Ar6upx34
+561-Kid Ink - Show Me (Explicit) ft. Chris Brown xKkb13IU_DE
+562-will.i.am - #thatPOWER ft. Justin Bieber (Official Music Video) DGIgXP9SvB8
+563-Marc Anthony - Ahora Quien (Salsa Version) toLrTToaN0M
+564-Future - Where Ya At ft. Drake lw3Or6eqIpI
+565-Taylor Swift - …Ready For It wIft-t-MQuE
+566-Young Money - Bed Rock (Official Music Video) Ha80ZaecGkQ
+567-Romeo Santos - Promise ft. Usher Y3XyWhrZnqE
+568-Chris Brown, Tyga - Ayo (Official Video) zKCrSN9oXgQ
+569-Hailee Steinfeld - Love Myself (Official Video) bMpFmHSgC4Q
+570-Justin Bieber - Never Let You Go (Official Music Video) 3ExWsVFJlFo
+571-Rihanna - California King Bed nhBorPm6JjQ
+572-Ways to Be Wicked (from Descendants 2) (Official Video) lX6g_cm2rM4
+573-Leona Lewis - Bleeding Love (US Version - Official Video) Vzo-EL_62fQ
+574-Labrinth - Beneath Your Beautiful (Official Video) ft. Emeli Sandé bqIxCtEveG8
+575-Marc Anthony - A Quién Quiero Mentirle (Video) GeApuPcMVeQ
+576-Chris Brown - Next To You (Official Music Video) ft. Justin Bieber EEuQU6a90Pc
+577-Sia - Big Girls Cry (Official Video) 4NhKWZpkw1Q
+578-Shakira - She Wolf (Official HD Video) booKP974B0k
+579-Jay Sean - Down ft. Lil Wayne (Official Music Video) oUbpGmR1-QM
+581-Avicii - Addicted To You Qc9c12q3mrc
+582-Pitbull - Hotel Room Service (Official Video) 2up_Eq6r6Ko
+583-Lady Gaga - Paparazzi (Official Music Video) d2smz_1L2_0
+584-Chris Brown - Yeah 3x 3mC2ixOAivA
+585-Nick Jonas - Close ft. Tove Lo XgJFqVvb2Ws
+586-Hailee Steinfeld, Grey - Starving ft. Zedd (Official Video) xwjwCFZpdns
+587-Ne-Yo - One In A Million (Official Music Video) 6tpl9LtkRRw
+588-Lady Gaga - Born This Way (Official Music Video) wV1FrqwZyKw
+589-Chris Brown - Turn Up the Music eQWG8BVeryU
+590-Juan Magan - Si No Te Quisiera ft. Belinda, Lapiz Conciente XoNCV9BsU9c
+591-Katy Perry - Teenage Dream (Official Music Video) 98WtmW-lfeE
+592-Cher Lloyd - Want U Back (US Version) (Official Music Video) LPgvNlrBfb0
+593-Shakira - Addicted to You MntbN1DdEP0
+594-Sebastián Yatra - Alguien Robó ft. Wisin, Nacho EH0Wg8SaITQ
+595-Chris Brown - Forever (Official HD Video) 5sMKX22BHeE
+596-Snow Patrol - Chasing Cars (Official Video) GemKqzILV4w
+597-Lil Wayne - Drop The World ft. Eminem (Official Music Video) ft. Eminem ErCAOMi5EGM
+598-Miley Cyrus - 7 Things (Official Video) Hr0Wv5DJhuk
+599-Matheus & Kauan - O Nosso Santo Bateu – Na Praia Ao Vivo kbCtpDwVCLQ
+600-Maejor Ali - Lolly ft. Juicy J, Justin Bieber BiOmXeKyrxo
+601-Kings Of Leon - Use Somebody (Official Video) gnhXHvRoUd0
+602-Pitbull - Fireball ft. John Ryan HMqgVXSvwGo
+603-Calvin Harris - Feel So Close (Official Video) dGghkjpNCQ8
+604-Carly Rae Jepsen - I Really Like You qV5lzRHrGeg
+605-Demi Lovato - Skyscraper (Official Video) r_8ydghbGSg
+606-Keri Hilson - Knock You Down (Official Music Video) ft. Kanye West, Ne-Yo p_RqWocthcc
+607-Wisin & Yandel - Te Siento SKWxOsbt9gU
+608-The Black Eyed Peas - Just Can't Get Enough (Official Music Video) OrTyD7rjBpw
+609-Jennifer Lopez - Live It Up ft. Pitbull BofL1AaiTjo
+610-Eminem - Just Lose It (Official Music Video) 9dcVOmEQzKA
+612-The Black Eyed Peas - Don't Stop The Party (Official Music Video) u9LH_y159sg
+613-Tinie Tempah - Written In The Stars ft. Eric Turner YgFyi74DVjc
+614-Big Sean - Blessings (Official Explicit Video) ft. Drake, Kanye West M6t47RI4bns
+615-Britney Spears - I Wanna Go (Official Video) T-sxSd1uwoU
+616-Rihanna - Russian Roulette (Official Music Video) ZQ2nCGawrSY
+617-Ellie Goulding - On My Mind (Official Video) H202k7KfZL0
+618-Pitbull - Hey Baby (Drop It To The Floor) ft. T-Pain LefQdEMJP1I
+619-Maître Gims - J'me tire (Clip officiel) F_rEHfLgdcY
+620-LMFAO - Champagne Showers ft. Natalia Kills UA8rcLvS1BY
+621-Nicki Minaj - Pound The Alarm (Explicit) vdrqA93sW-8
+622-Maluma - La Curiosidad 9t7eMteW-Tc
+623-Shakira - Rabiosa (English Version) ft. Pitbull a5irTX82olg
+624-Rich Homie Quan - Type of Way (Official Video) -KKbdErJkiY
+625-P!nk - Just Like Fire (From'Alice Through The Looking Glass' - Official Video) 5Nrv5teMc9Y
+626-Rihanna - What Now (Official) b-3BI9AspYc
+627-Camila - De Que Me Sirve la Vida 3YhoejhnW8w
+628-Jennifer Lopez - Goin' In ft. Flo Rida z5W7DVFKrcs
+629-LMFAO ft. Lil Jon - Shots (Official Video) XNtTEibFvlQ
+630-Ciara - Like A Boy (Official Video) _HKH7Emy1SY
+631-Calvin Harris & Alesso - Under Control (Official Video) ft. Hurts yZqmarGShxg
+632-Fifth Harmony - BO$$ (BOSS) Y4JfPlry-iQ
+633-Eminem - Berzerk (Official Music Video) (Explicit) ab9176Srb5Y
+634-Years & Years - King (Official Video) g_uoH6hJilc
+635-Ne-Yo - So Sick (Official Music Video) IxszlJppRQI
+636-Selena Gomez & The Scene - A Year Without Rain M8uPvX2te0I
+637-Daddy Yankee - Sabado Rebelde ft. Plan B 0nPkXDrL2ZU
+638-Kanye West - All Of The Lights ft. Rihanna, Kid Cudi HAfFfqiYLp0
+639-Zedd - Stay The Night ft. Hayley Williams (Official Music Video) i-gyZ35074k
+640-Yandel - Como Antes (Official Video) ft. Wisin QeaumjX9DNY
+641-Taylor Swift - Back To December QUwxKWT6m7U
+642-Romeo Santos - Rival (Official Video) ft. Mario Domm 6vMhhBRj-2Q
+643-Henrique & Diego - Suíte 14 (Ao Vivo) ft. Mc Guimê gmvFLIuVAbA
+644-Britney Spears - Gimme More (Official HD Video) elueA2rofoo
+645-Rihanna - You Da One b3HeLs8Yosw
+646-Avicii - Hey Brother 6Cp6mKbRTQY
+647-Soulja Boy Tell'em ft. Sammie - Kiss Me Thru The Phone (Official Video) 47Fbo4kU2AU
+648-Beyoncé - Partition (Explicit Video) pZ12_E5R3qc
+649-Kid Cudi - Pursuit Of Happiness (Official Music Video) ft. MGMT 7xzU9Qqdqww
+650-Sigala - Sweet Lovin' ft. Bryn Christopher (Official Video) qj5zT4t7S6c
+651-The Game - My Life ft. Lil Wayne (Official Music Video) udxZ9zkDzpo
+652-Nicki Minaj - Moment 4 Life (Clean Version) (Official Music Video) ft. Drake D7GW8TYCEG4
+653-Nicki Minaj - High School (Explicit) ft. Lil Wayne RnpyRe_7jZA
+654-Chino & Nacho - Niña Bonita Oe1fRwgGu5E
+655-Far East Movement ft. The Cataracs, DEV - Like A G6 (Official Video) w4s6H4ku6ZY
+656-Pitbull, Ne-Yo - Time Of Our Lives bTXJQ5ql5Fw
+657-Lorde - Team f2JuxM-snGc
+658-Christina Aguilera - Candyman (Official Music Video) -ScjucUV8v0
+659-Katy Perry - I Kissed A Girl (Official) tAp9BKosZXs
+660-One Direction - Gotta Be You nvfejaHz-o0
+661-Nicki Minaj - Pills N Potions (Official) f7ld-3nZUxA
+662-Mohombi - Bumpy Ride G2RCCDSBEGk
+663-Demi Lovato - Neon Lights (Official Video) v9uDwppN5-w
+664-The Pussycat Dolls - When I Grow Up (Official Music Video) K0K46C82v9o
+665-Chris Brown - Don't Wake Me Up (Official Music Video) QOowQeKyNkQ
+666-Christina Aguilera - Hurt (Main Video) wwCykGDEp7M
+667-Eminem - We Made You (Official Music Video) RSdKmX2BH7o
+668-Taio Cruz - Break Your Heart (Official Video) ft. Ludacris y_SI2EDM6Lo
+669-Demi Lovato - Really Don't Care ft. Cher Lloyd (Official Video) OJGUbwVMBeA
+670-P!nk - Raise Your Glass (Official Video) XjVNlG5cZyQ
+671-Austin Mahone - Mmm Yeah ft. Pitbull MMAppa1cAVo
+672-Avril Lavigne - Smile (Official Music Video) KagvExF-ijc
+673-Little Mix - Wings (Official Video) cOQDsmEqVt8
+674-Brandon Beal - Twerk It Like Miley - Produced by Hedegaard ft. Christopher PLE57UZievU
+675-El Bebeto - No Te Creas Tan Importante (Video Oficial) nMv2PeG-2mc
+676-Eminem - My Name Is (Official Music Video) sNPnbI1arSE
+677-Justin Bieber - All That Matters JC2yu2a9sHk
+678-The Wanted - Glad You Came 2ggzxInyzVE
+679-Maluma - Addicted (Official Music Video) pMIHC_cItd4
+680-Pitbull - Fun (Official Video) ft. Chris Brown jKbR7u8J5PU
+681-Thalia - Desde Esa Noche (Official Video) ft. Maluma CkyBXdXkMr8
+682-Michael Jackson - You Rock My World (Official Video) 1-7ABIM2qjU
+683-The Band Perry - If I Die Young (Official Video) 7NJqUN9TClM
+684-Alessia Cara - Here (Official Video) UKp2CrfmVfw
+685-Güliz Ayla - Olmazsan Olmaz j-T4hRJNFJI
+686-Rihanna - Disturbia E1mU6h4Xdxc
+687-Beyoncé - Diva rNM5HW13_O8
+688-Bridgit Mendler - Ready or Not (Official Video) dPKG1-3LXBs
+689-Nicki Minaj - Beez In The Trap (Explicit) ft. 2 Chainz EmZvOhHF85I
+691-Snoop Dogg - 'Sweat' Snoop Dogg vs David Guetta (Remix) [Official Video] KnEXrbAQyIo
+692-Olly Murs - Troublemaker ft. Flo Rida 4aQDOUbErNg
+693-Ciara ft. Ludacris - Ride (Official Video) Lp6W4aK1sbs
+694-Chris Brown - Don't Judge Me z29nI8RQV0U
+695-Kendrick Lamar - LOYALTY. ft. Rihanna Dlh-dzB2U4Y
+696-Chris Brown - Love More (Explicit) ft. Nicki Minaj Tff2oE31Mlw
+697-Christina Aguilera - Your Body (Official Music Video) (Clean Version) 6cfCgLgiFDM
+699-MIKA - Popular Song ft. Ariana Grande nmcdLOjGVzw
+700-Britney Spears - Till The World Ends (Official Video) qzU9OrZlKb8
+701-Zendaya - Replay cyLE48i4XY0
+702-Shontelle - Impossible (Official Video) NWdrO4BoCu8
+703-Nicki Minaj, Cassie - The Boys (Explicit) kXFcr6oy5dk
+704-Miley Cyrus - Can't Be Tamed (Official Video) sjSG6z_13-Q
+705-Little Mix - Move (Official Video) RwD4eJGxPc4
+706-Selena Gomez - Tell Me Something I Don't Know _RRyniZG0Jo
+707-Britney Spears - Circus (Official HD Video) lVhJ_A8XUgc
+708-Avril Lavigne - Here's to Never Growing Up sXd2WxoOP5g
+709-Lady Gaga - LoveGame (Official Music Video) 1mB0tP1I-14
+710-Chris Brown - Gimme That (remix) ft. Lil Wayne 3yl-5FOZcr0
+711-Beyoncé - Sweet Dreams JlxByc0-V40
+712-Leona Lewis - Bleeding Love (Official Video) 7_weSk0BonM
+713-Taio Cruz - Dynamite (Official Video) VUjdiDeJ0xg
+714-Tinashe - 2 On (Explicit) ft. SchoolBoy Q -s7TCuCpB5c
+715-Natalie La Rose - Somebody ft. Jeremih 8zqdo_Umd5c
+717-Usher - OMG ft. will.i.am 1RnPB76mjxI
+718-Taylor Swift - Our Song Jb2stN7kH28
+719-Lil Wayne - How To Love (Official Music Video) y8Gf4-eT3w0
+720-Nicole Scherzinger - Right There ft. 50 Cent t-vTaktsUSw
+721-OneRepublic - Good Life (Official Music Video) jZhQOvvV45w
+722-Britney Spears, Iggy Azalea - Pretty Girls (Official Video) uV2uebhnqOw
+723-Ellie Goulding - Lights (Official Video) 0NKUpo_xKyQ
+724-Miley Cyrus - Adore You (Official Video) W1tzURKYFNs
+725-Kanye West - Heartless Co0tTeuUVhU
+726-Rihanna - Te Amo (Official Music Video) Oe4Ic7fHWf8
+727-Ariana Grande - Baby I bJuWlMFToNo
+728-Vanessa Hudgens - Say Ok (Official Music Video) F5VvvVxuKko
+729-DJ Khaled - I'm On One (Explicit Version) ft. Drake, Rick Ross, Lil Wayne Z09lYqdxqzo
+730-Demi Lovato - Made in the USA (Official Video) z3zdIHDTbg0
+731-Train - Drive By (Official Music Video) oxqnFJ3lp5k
+732-Eminem - The Way I Am (Official Music Video) mQvteoFiMlg
+733-Timbaland - Carry Out (Official Music Video) ft. Justin Timberlake NRdHsuuXxfk
+734-Daddy Yankee - La Noche De Los Dos ft. Natalia Jiménez GDBaeQ5JPuU
+735-Justin Bieber - U Smile (Official Music Video) r2ozuCXpVJY
+736-Ke$ha - Die Young (Official) NOubzHCUt48
+737-Nick Jonas - Jealous yw04QD1LaB0
+739-Ariana Grande - Right There ft. Big Sean fhcpubAVdmc
+740-Selena Gomez & The Scene - Hit The Lights 8c2ahBlTPz0
+741-Eminem - Survival (Explicit) NlmezywdxPI
+742-Miley Cyrus - Who Owns My Heart (Official Video) iVbQxC2c3-8
+743-Rihanna - Cheers (Drink To That) ZR0v0i63PQ4
+744-Sigala - Easy Love (Official Music Video) ozx898ADTxM
+745-Farruko - Besas Tan Bien (Official Video) E-kkX2UuBcg
+746-OneRepublic - All The Right Moves (Official Music Video) qrOeGCJdZe4
+747-Enrique Iglesias, Usher - Dirty Dancer ft. Lil Wayne vHJAUuicC0Q
+748-Austin Mahone - What About Love (Official Video) 2PEG82Udb90
+749-Rihanna - Hard (Official Music Video) ft. Jeezy Xcwd_Nz6Zog
+750-Lady Gaga - Perfect Illusion (Official Music Video) Xn599R0ZBwg
+752-MGMT - Electric Feel (Official HD Video) MmZexg8sxyk
+753-'Weird Al' Yankovic - White & Nerdy (Official Music Video) N9qYF9DZPdw
+754-Taylor Swift - White Horse D1Xr-JFLxik
+755-Miley Cyrus - When I Look At You (Official Video) 8wxOVn99FTE
+756-Ne-Yo - Let Me Love You (Until You Learn To Love Yourself) (Official Music Video) crrOl0egI00
+757-Cher Lloyd - Oath (Official Music Video) ft. Becky G Cqz713hhz1Y
+758-Timbaland - If We Ever Meet Again ft. Katy Perry (Official Music Video) KDKva-s_khY
+759-'Watch Me' from Disney Channel's 'Shake It Up' (Official Video) PPNMGYOm1aM
+761-Taio Cruz - Hangover (Official Video) ft. Flo Rida dLhFDYQHDQY
+762-Daddy Yankee - Ven Conmigo ft. Prince Royce ZEInlYjVFzk
+765-Demi Lovato - La La Land (Official Music Video) nmjO1p9Oxrk
+766-Selena Gomez & The Scene - Round & Round UfcvO2t8Ntg
+767-Britney Spears - Hold It Against Me (Official Video) -Edv8Onsrgg
+768-Far East Movement - Turn Up The Love ft. Cover Drive UqXVgAmqBOs
+769-Justin Bieber - Pray (Official Music Video) o9tJW9MDs2M
+770-Drake - Find Your Love (Official Music Video) Xyv4Bjja8yc
+772-Nicki Minaj - Va Va Voom (Explicit) 3U72hzeBLOw
+773-will.i.am, Nicki Minaj - Check It Out (Official Music Video) pqky5B179nM
+774-Nicki Minaj - Stupid Hoe (Explicit) T6j4f8cHBIM
+775-Jennifer Lopez ft. French Montana - I Luh Ya Papi (Explicit) [Official Video] c4oiEhf9M04
diff --git a/dataset/vevo_meta/top_chord.txt b/dataset/vevo_meta/top_chord.txt
new file mode 100644
index 0000000000000000000000000000000000000000..26456cc45487d0d14cc23aae16e163403dc01995
--- /dev/null
+++ b/dataset/vevo_meta/top_chord.txt
@@ -0,0 +1,30 @@
+C 1 32576
+A:min 122 31898
+F 66 22538
+G 92 22137
+E:min 57 7935
+D:min 31 6457
+D 27 3973
+A:min7 121 3846
+A 118 3606
+E 53 2613
+D:min7 30 2598
+F:maj7 78 2530
+A# 131 1854
+E:min7 56 1695
+E:7 63 1396
+G:7 102 1321
+C:maj7 13 1039
+C:7 11 791
+D:7 37 697
+G:min 96 685
+C:min 5 684
+B:min 148 528
+F:min 70 474
+B 144 459
+D# 40 459
+G# 105 452
+A:7 128 391
+F:7 76 384
+G:sus4 94 384
+G:min7 95 277
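Each line of top_chord.txt is `<chord label> <chord id from chord_inv.json> <occurrence count>`, sorted by descending count, so "C 1 32576" says the plain C major triad (id 1) is the most frequent chord in the corpus. A small parsing sketch (illustrative, not repo code):

```python
# Illustrative sketch: parse top_chord.txt rows of the form
# "<label> <chord id> <count>", sorted by descending occurrence count.
def load_top_chords(path="dataset/vevo_meta/top_chord.txt"):
    rows = []
    with open(path) as f:
        for line in f:
            label, chord_id, count = line.split()
            rows.append((label, int(chord_id), int(count)))
    return rows

top = load_top_chords()
print(top[0])   # ('C', 1, 32576): the most frequent chord
```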
diff --git a/default_sound_font.sf2 b/default_sound_font.sf2
new file mode 100644
index 0000000000000000000000000000000000000000..14b4bfccc13d330c811e8a2b4630d314173e40fe
--- /dev/null
+++ b/default_sound_font.sf2
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74594e8f4250680adf590507a306655a299935343583256f3b722c48a1bc1cb0
+size 148398306
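The ~148 MB soundfont is committed as a Git LFS pointer, so only the three-line stub above lives in the repository. A hedged sketch of rendering a generated MIDI file to audio with it, assuming the midi2audio package and a locally installed fluidsynth binary; "output.mid" and "output.wav" are placeholder paths:

```python
# Hedged sketch: render a MIDI file to audio with the bundled soundfont.
# Assumes the midi2audio package and a fluidsynth binary are installed;
# "output.mid" / "output.wav" are placeholder paths.
from midi2audio import FluidSynth

fs = FluidSynth(sound_font="default_sound_font.sf2")
fs.midi_to_audio("output.mid", "output.wav")
```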
diff --git a/model/__pycache__/music_transformer.cpython-37.pyc b/model/__pycache__/music_transformer.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c092bb158323b32021470593d5d8c0041adb85c
Binary files /dev/null and b/model/__pycache__/music_transformer.cpython-37.pyc differ
diff --git a/model/__pycache__/positional_encoding.cpython-37.pyc b/model/__pycache__/positional_encoding.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43d4430de58959375ba40dc1250804bf7fd81868
Binary files /dev/null and b/model/__pycache__/positional_encoding.cpython-37.pyc differ
diff --git a/model/__pycache__/positional_encoding.cpython-38.pyc b/model/__pycache__/positional_encoding.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a418d963e1a78f4e87f310189c714cd49776b1e
Binary files /dev/null and b/model/__pycache__/positional_encoding.cpython-38.pyc differ
diff --git a/model/__pycache__/rpr.cpython-37.pyc b/model/__pycache__/rpr.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4deeff2fa2df843f3927bf9280e008b82e76ae20
Binary files /dev/null and b/model/__pycache__/rpr.cpython-37.pyc differ
diff --git a/model/__pycache__/rpr.cpython-38.pyc b/model/__pycache__/rpr.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3418d5d9daa7e7553a735bad69e6dfe1cb3917a9
Binary files /dev/null and b/model/__pycache__/rpr.cpython-38.pyc differ
diff --git a/model/__pycache__/video_music_transformer.cpython-37.pyc b/model/__pycache__/video_music_transformer.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b4b3efb1730d7849ae97674a26859debe332cc9
Binary files /dev/null and b/model/__pycache__/video_music_transformer.cpython-37.pyc differ
diff --git a/model/__pycache__/video_music_transformer.cpython-38.pyc b/model/__pycache__/video_music_transformer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cf86a1bfae9baf9d0733a0e96b79845a95acb7b
Binary files /dev/null and b/model/__pycache__/video_music_transformer.cpython-38.pyc differ
diff --git a/model/__pycache__/video_regression.cpython-37.pyc b/model/__pycache__/video_regression.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e4dd9d485e65d7310362da2cceb6bb3b7d5482b
Binary files /dev/null and b/model/__pycache__/video_regression.cpython-37.pyc differ
diff --git a/model/__pycache__/video_regression.cpython-38.pyc b/model/__pycache__/video_regression.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4ff92ea3fd9606ff5515b9132657ba899ca62c1
Binary files /dev/null and b/model/__pycache__/video_regression.cpython-38.pyc differ
diff --git a/model/loss.py b/model/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..62ff3d5d39a13ced021e1b9ca27973804a3262e7
--- /dev/null
+++ b/model/loss.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.loss import _Loss
+
+# Borrowed from https://github.com/jason9693/MusicTransformer-pytorch/blob/5f183374833ff6b7e17f3a24e3594dedd93a5fe5/custom/criterion.py#L28
+class SmoothCrossEntropyLoss(_Loss):
+ """
+ https://arxiv.org/abs/1512.00567
+ """
+ __constants__ = ['label_smoothing', 'vocab_size', 'ignore_index', 'reduction']
+
+ def __init__(self, label_smoothing, vocab_size, ignore_index=-100, reduction='mean', is_logits=True):
+ assert 0.0 <= label_smoothing <= 1.0
+ super().__init__(reduction=reduction)
+
+ self.label_smoothing = label_smoothing
+ self.vocab_size = vocab_size
+ self.ignore_index = ignore_index
+ self.input_is_logits = is_logits
+
+ def forward(self, input, target):
+ """
+ Args:
+ input: [B * T, V]
+ target: [B * T]
+ Returns:
+ cross entropy: [1]
+ """
+ mask = (target == self.ignore_index).unsqueeze(-1)
+ q = F.one_hot(target.long(), self.vocab_size).type(torch.float32)
+ u = 1.0 / self.vocab_size
+ q_prime = (1.0 - self.label_smoothing) * q + self.label_smoothing * u
+ q_prime = q_prime.masked_fill(mask, 0)
+
+ ce = self.cross_entropy_with_logits(q_prime, input)
+ if self.reduction == 'mean':
+ lengths = torch.sum(target != self.ignore_index)
+ return ce.sum() / lengths
+ elif self.reduction == 'sum':
+ return ce.sum()
+ else:
+ raise NotImplementedError
+
+ def cross_entropy_with_logits(self, p, q):
+ return -torch.sum(p * (q - q.logsumexp(dim=-1, keepdim=True)), dim=-1)
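SmoothCrossEntropyLoss mixes the one-hot target with a uniform distribution, q' = (1 - eps) * q + eps / V, zeroes the target at ignore_index positions, and averages the cross entropy over the remaining tokens. A minimal usage sketch; the shapes and the 157-symbol vocabulary (the ids in chord_inv.json) are illustrative assumptions, and the repo's actual CHORD_SIZE constant may include additional pad/end tokens:

```python
# Minimal usage sketch for SmoothCrossEntropyLoss. vocab_size=157 (the ids in
# chord_inv.json) and the batch/sequence sizes are illustrative assumptions.
import torch
from model.loss import SmoothCrossEntropyLoss

criterion = SmoothCrossEntropyLoss(label_smoothing=0.1, vocab_size=157)

logits = torch.randn(8 * 30, 157)            # [B * T, V] unnormalized scores
targets = torch.randint(0, 157, (8 * 30,))   # [B * T] chord ids
print(criterion(logits, targets))            # scalar smoothed cross entropy
```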
diff --git a/model/music_transformer.py b/model/music_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..633b2b335c098ae2c19d023a5ece8424e559034c
--- /dev/null
+++ b/model/music_transformer.py
@@ -0,0 +1,177 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.normalization import LayerNorm
+import random
+
+from utilities.constants import *
+from utilities.device import get_device
+
+from .positional_encoding import PositionalEncoding
+from .rpr import TransformerEncoderRPR, TransformerEncoderLayerRPR
+import json
+# MusicTransformer
+class MusicTransformer(nn.Module):
+ def __init__(self, n_layers=6, num_heads=8, d_model=512, dim_feedforward=1024,
+ dropout=0.1, max_sequence_midi=2048, max_sequence_chord=300, rpr=False):
+ super(MusicTransformer, self).__init__()
+
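+ # A no-op decoder: together with num_decoder_layers=0 below, it makes
+ # nn.Transformer behave as an encoder-only stack.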
+ self.dummy = DummyDecoder()
+ self.nlayers = n_layers
+ self.nhead = num_heads
+ self.d_model = d_model
+ self.d_ff = dim_feedforward
+ self.dropout = dropout
+ self.max_seq_midi = max_sequence_midi
+ self.max_seq_chord = max_sequence_chord
+ self.rpr = rpr
+
+ # Input embeddings for chord tokens (full chord, root, and attribute)
+ self.embedding = nn.Embedding(CHORD_SIZE, self.d_model)
+
+ # self.embedding_key = nn.Embedding(1, self.d_model)
+ self.embedding_root = nn.Embedding(CHORD_ROOT_SIZE, self.d_model)
+ self.embedding_attr = nn.Embedding(CHORD_ATTR_SIZE, self.d_model)
+
+ self.positional_encoding = PositionalEncoding(self.d_model, self.dropout, self.max_seq_chord)
+ self.Linear_chord = nn.Linear(self.d_model+1, self.d_model)
+
+ # Base transformer
+ if(not self.rpr):
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=0, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff, custom_decoder=self.dummy
+ )
+ # RPR Transformer
+ else:
+ encoder_norm = LayerNorm(self.d_model)
+ encoder_layer = TransformerEncoderLayerRPR(self.d_model, self.nhead, self.d_ff, self.dropout, er_len=self.max_seq_chord)
+
+ encoder = TransformerEncoderRPR(encoder_layer, self.nlayers, encoder_norm)
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=0, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff, custom_decoder=self.dummy, custom_encoder=encoder
+ )
+ # Final output is a softmaxed linear layer
+ self.Wout = nn.Linear(self.d_model, CHORD_SIZE)
+ self.Wout_root = nn.Linear(self.d_model, CHORD_ROOT_SIZE)
+ self.Wout_attr = nn.Linear(self.d_model, CHORD_ATTR_SIZE)
+ self.softmax = nn.Softmax(dim=-1)
+
+ # forward
+ def forward(self, x, x_root, x_attr, feature_key, mask=True):
+ if(mask is True):
+ mask = self.transformer.generate_square_subsequent_mask(x.shape[1]).to(get_device())
+ else:
+ mask = None
+
+ ### Chord + Key (DECODER) ###
+ # x = self.embedding(x)
+
+ x_root = self.embedding_root(x_root)
+ x_attr = self.embedding_attr(x_attr)
+ x = x_root + x_attr
+
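+ # Condition on the musical key: broadcast the scalar key feature over
+ # (batch, seq, 1), concatenate it to the chord embedding, and project
+ # back down to d_model.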
+ feature_key_padded = torch.full((x.shape[0], x.shape[1], 1), feature_key.item())
+ feature_key_padded = feature_key_padded.to(get_device())
+ x = torch.cat([x, feature_key_padded], dim=-1)
+ xf = self.Linear_chord(x)
+
+ ### POSITIONAL ENCODING ###
+ xf = xf.permute(1,0,2) # -> (seq_len, batch_size, d_model)
+ xf = self.positional_encoding(xf)
+
+ ### TRANSFORMER ###
+ x_out = self.transformer(src=xf, tgt=xf, tgt_mask=mask)
+ x_out = x_out.permute(1,0,2)
+
+ if IS_SEPERATED:
+ y_root = self.Wout_root(x_out)
+ y_attr = self.Wout_attr(x_out)
+ del mask
+ return y_root, y_attr
+ else:
+ y = self.Wout(x_out)
+ del mask
+ return y
+
+ # generate
+ def generate(self, feature_key=None, primer=None, primer_root=None, primer_attr=None, target_seq_length=300, beam=0, beam_chance=1.0):
+ assert (not self.training), "Cannot generate while in training mode"
+
+ with open('dataset/vevo_meta/chord_inv.json') as json_file:
+ chordInvDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_root.json') as json_file:
+ chordRootDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_attr.json') as json_file:
+ chordAttrDic = json.load(json_file)
+
+ print("Generating sequence of max length:", target_seq_length)
+ gen_seq = torch.full((1,target_seq_length), CHORD_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())
+ gen_seq_root = torch.full((1,target_seq_length), CHORD_ROOT_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())
+ gen_seq_attr = torch.full((1,target_seq_length), CHORD_ATTR_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())
+
+ num_primer = len(primer)
+
+ gen_seq[..., :num_primer] = primer.type(TORCH_LABEL_TYPE).to(get_device())
+ gen_seq_root[..., :num_primer] = primer_root.type(TORCH_LABEL_TYPE).to(get_device())
+
+ gen_seq_attr[..., :num_primer] = primer_attr.type(TORCH_LABEL_TYPE).to(get_device())
+
+ cur_i = num_primer
+ while(cur_i < target_seq_length):
+ # gen_seq_batch = gen_seq.clone()
+ # y = self.softmax(self.forward(gen_seq[..., :cur_i]))[..., :CHORD_END]
+ y = self.softmax( self.forward( gen_seq[..., :cur_i], gen_seq_root[..., :cur_i], gen_seq_attr[..., :cur_i], feature_key) )[..., :CHORD_END]
+
+ token_probs = y[:, cur_i-1, :]
+ if(beam == 0):
+ beam_ran = 2.0
+ else:
+ beam_ran = random.uniform(0,1)
+ if(beam_ran <= beam_chance):
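+ # Top-k over the flattened (beams x vocab) probabilities; each flat
+ # index encodes both the source beam (row) and the chosen token (column).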
+ token_probs = token_probs.flatten()
+ top_res, top_i = torch.topk(token_probs, beam)
+ beam_rows = top_i // CHORD_SIZE
+ beam_cols = top_i % CHORD_SIZE
+ gen_seq = gen_seq[beam_rows, :]
+ gen_seq[..., cur_i] = beam_cols
+ else:
+ distrib = torch.distributions.categorical.Categorical(probs=token_probs)
+ next_token = distrib.sample()
+ #print("next token:",next_token)
+ gen_seq[:, cur_i] = next_token
+ gen_chord = chordInvDic[ str( next_token.item() ) ]
+
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = 1
+ chordRootID = torch.tensor([chordRootID]).to(get_device())
+ chordAttrID = torch.tensor([chordAttrID]).to(get_device())
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+ elif len(chord_arr) == 2:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ chordRootID = torch.tensor([chordRootID]).to(get_device())
+ chordAttrID = torch.tensor([chordAttrID]).to(get_device())
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+
+ # Let the transformer decide to end if it wants to
+ if(next_token == CHORD_END):
+ print("Model called end of sequence at:", cur_i, "/", target_seq_length)
+ break
+
+ cur_i += 1
+ if(cur_i % 50 == 0):
+ print(cur_i, "/", target_seq_length)
+ return gen_seq[:, :cur_i]
+
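+# DummyDecoder is an identity decoder: it returns the encoder memory unchanged,
+# which lets nn.Transformer run as an encoder-only stack (num_decoder_layers=0).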
+class DummyDecoder(nn.Module):
+ def __init__(self):
+ super(DummyDecoder, self).__init__()
+ def forward(self, tgt, memory, tgt_mask, memory_mask,tgt_key_padding_mask,memory_key_padding_mask):
+ return memory
diff --git a/model/positional_encoding.py b/model/positional_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..5820a69d721c879cfcdb21db63b9aef83b98f2bb
--- /dev/null
+++ b/model/positional_encoding.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+import math
+
+# PositionalEncoding
+# Taken from https://pytorch.org/tutorials/beginner/transformer_tutorial.html
+class PositionalEncoding(nn.Module):
+
+ def __init__(self, d_model, dropout=0.1, max_len=5000):
+ super(PositionalEncoding, self).__init__()
+ self.dropout = nn.Dropout(p=dropout)
+
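+ # Sinusoidal table: PE(pos, 2i) = sin(pos / 10000^(2i/d_model)),
+ # PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))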
+ pe = torch.zeros(max_len, d_model)
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0).transpose(0, 1)
+ self.register_buffer('pe', pe)
+
+ def forward(self, x):
+ x = x + self.pe[:x.size(0), :]
+ return self.dropout(x)
diff --git a/model/rpr.py b/model/rpr.py
new file mode 100644
index 0000000000000000000000000000000000000000..1573451715f8cbcdb8834bc11f7372441d843d95
--- /dev/null
+++ b/model/rpr.py
@@ -0,0 +1,455 @@
+import torch
+import torch.nn as nn
+import warnings
+
+from torch.nn import functional as F
+from torch.nn.parameter import Parameter
+from torch.nn import Module
+from torch.nn.modules.transformer import _get_clones
+from torch.nn.modules.linear import Linear
+from torch.nn.modules.dropout import Dropout
+from torch.nn.modules.normalization import LayerNorm
+from torch.nn.init import *
+
+from torch.nn.functional import linear, softmax, dropout
+from torch.nn import MultiheadAttention
+from typing import Optional
+
+class TransformerDecoderRPR(Module):
+ def __init__(self, decoder_layer, num_layers, norm=None):
+ super(TransformerDecoderRPR, self).__init__()
+ self.layers = _get_clones(decoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+
+ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
+ output = tgt
+ for mod in self.layers:
+ output = mod(output, memory, tgt_mask=tgt_mask,
+ memory_mask=memory_mask,
+ tgt_key_padding_mask=tgt_key_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+
+ if self.norm is not None:
+ output = self.norm(output)
+
+ return output
+
+class TransformerDecoderLayerRPR(Module):
+ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, er_len=None):
+ super(TransformerDecoderLayerRPR, self).__init__()
+
+ self.self_attn = MultiheadAttentionRPR(d_model, nhead, dropout=dropout, er_len=er_len)
+ self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+ # Implementation of Feedforward model
+ self.linear1 = Linear(d_model, dim_feedforward)
+ self.dropout = Dropout(dropout)
+ self.linear2 = Linear(dim_feedforward, d_model)
+
+ self.norm1 = LayerNorm(d_model)
+ self.norm2 = LayerNorm(d_model)
+ self.norm3 = LayerNorm(d_model)
+ self.dropout1 = Dropout(dropout)
+ self.dropout2 = Dropout(dropout)
+ self.dropout3 = Dropout(dropout)
+
+ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
+ tgt_key_padding_mask=None, memory_key_padding_mask=None):
+ tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
+ key_padding_mask=tgt_key_padding_mask)[0]
+ tgt = tgt + self.dropout1(tgt2)
+ tgt = self.norm1(tgt)
+
+ tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
+ key_padding_mask=memory_key_padding_mask)[0]
+
+ tgt = tgt + self.dropout2(tgt2)
+ tgt = self.norm2(tgt)
+ tgt2 = self.linear2(self.dropout(F.relu(self.linear1(tgt))))
+ tgt = tgt + self.dropout3(tgt2)
+ tgt = self.norm3(tgt)
+ return tgt
+
+# TransformerEncoderRPR (only for music transformer)
+class TransformerEncoderRPR(Module):
+ def __init__(self, encoder_layer, num_layers, norm=None):
+ super(TransformerEncoderRPR, self).__init__()
+ self.layers = _get_clones(encoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+ def forward(self, src, mask=None, src_key_padding_mask=None):
+ output = src
+ for i in range(self.num_layers):
+ output = self.layers[i](output, src_mask=mask,
+ src_key_padding_mask=src_key_padding_mask)
+ if self.norm:
+ output = self.norm(output)
+ return output
+
+# TransformerEncoderLayerRPR (only for music transformer)
+class TransformerEncoderLayerRPR(Module):
+ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, er_len=None):
+ super(TransformerEncoderLayerRPR, self).__init__()
+ self.self_attn = MultiheadAttentionRPR(d_model, nhead, dropout=dropout, er_len=er_len)
+ # Implementation of Feedforward model
+ self.linear1 = Linear(d_model, dim_feedforward)
+ self.dropout = Dropout(dropout)
+ self.linear2 = Linear(dim_feedforward, d_model)
+ self.norm1 = LayerNorm(d_model)
+ self.norm2 = LayerNorm(d_model)
+ self.dropout1 = Dropout(dropout)
+ self.dropout2 = Dropout(dropout)
+ def forward(self, src, src_mask=None, src_key_padding_mask=None):
+ src2 = self.self_attn(src, src, src, attn_mask=src_mask,
+ key_padding_mask=src_key_padding_mask)[0]
+ src = src + self.dropout1(src2)
+ src = self.norm1(src)
+ src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
+ src = src + self.dropout2(src2)
+ src = self.norm2(src)
+ return src
+
+# MultiheadAttentionRPR
+class MultiheadAttentionRPR(Module):
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None, er_len=None):
+ super(MultiheadAttentionRPR, self).__init__()
+ self.embed_dim = embed_dim
+ self.kdim = kdim if kdim is not None else embed_dim
+ self.vdim = vdim if vdim is not None else embed_dim
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+ self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
+
+ if self._qkv_same_embed_dim is False:
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
+
+ if bias:
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
+ else:
+ self.register_parameter('in_proj_bias', None)
+ self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
+
+ if add_bias_kv:
+ self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
+ self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
+ else:
+ self.bias_k = self.bias_v = None
+
+ self.add_zero_attn = add_zero_attn
+
+ # Adding RPR embedding matrix
+ if(er_len is not None):
+ self.Er = Parameter(torch.rand((er_len, self.head_dim), dtype=torch.float32))
+ else:
+ self.Er = None
+
+ self._reset_parameters()
+
+ def _reset_parameters(self):
+ if self._qkv_same_embed_dim:
+ xavier_uniform_(self.in_proj_weight)
+ else:
+ xavier_uniform_(self.q_proj_weight)
+ xavier_uniform_(self.k_proj_weight)
+ xavier_uniform_(self.v_proj_weight)
+
+ if self.in_proj_bias is not None:
+ constant_(self.in_proj_bias, 0.)
+ constant_(self.out_proj.bias, 0.)
+ if self.bias_k is not None:
+ xavier_normal_(self.bias_k)
+ if self.bias_v is not None:
+ xavier_normal_(self.bias_v)
+
+ def forward(self, query, key, value, key_padding_mask=None,
+ need_weights=True, attn_mask=None):
+
+ if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False:
+
+ return multi_head_attention_forward_rpr(
+ query, key, value, self.embed_dim, self.num_heads,
+ self.in_proj_weight, self.in_proj_bias,
+ self.bias_k, self.bias_v, self.add_zero_attn,
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
+ training=self.training,
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
+ attn_mask=attn_mask, use_separate_proj_weight=True,
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
+ v_proj_weight=self.v_proj_weight, rpr_mat=self.Er)
+ else:
+ if not hasattr(self, '_qkv_same_embed_dim'):
+ warnings.warn('A new version of MultiheadAttention module has been implemented. \
+ Please re-train your model with the new module',
+ UserWarning)
+
+ return multi_head_attention_forward_rpr(
+ query, key, value, self.embed_dim, self.num_heads,
+ self.in_proj_weight, self.in_proj_bias,
+ self.bias_k, self.bias_v, self.add_zero_attn,
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
+ training=self.training,
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
+ attn_mask=attn_mask, rpr_mat=self.Er)
+
+# multi_head_attention_forward_rpr
+def multi_head_attention_forward_rpr(query, # type: Tensor
+ key, # type: Tensor
+ value, # type: Tensor
+ embed_dim_to_check, # type: int
+ num_heads, # type: int
+ in_proj_weight, # type: Tensor
+ in_proj_bias, # type: Tensor
+ bias_k, # type: Optional[Tensor]
+ bias_v, # type: Optional[Tensor]
+ add_zero_attn, # type: bool
+ dropout_p, # type: float
+ out_proj_weight, # type: Tensor
+ out_proj_bias, # type: Tensor
+ training=True, # type: bool
+ key_padding_mask=None, # type: Optional[Tensor]
+ need_weights=True, # type: bool
+ attn_mask=None, # type: Optional[Tensor]
+ use_separate_proj_weight=False, # type: bool
+ q_proj_weight=None, # type: Optional[Tensor]
+ k_proj_weight=None, # type: Optional[Tensor]
+ v_proj_weight=None, # type: Optional[Tensor]
+ static_k=None, # type: Optional[Tensor]
+ static_v=None, # type: Optional[Tensor]
+ rpr_mat=None
+ ):
+ """
+ ----------
+ Author: Pytorch
+ Modified: Damon Gwinn
+ ----------
+ For Relative Position Representation support (https://arxiv.org/abs/1803.02155)
+ https://pytorch.org/docs/1.2.0/_modules/torch/nn/functional.html
+ Modification to take RPR embedding matrix and perform skew optimized RPR (https://arxiv.org/abs/1809.04281)
+ ----------
+ """
+ # type: (...) -> Tuple[Tensor, Optional[Tensor]]
+
+ qkv_same = torch.equal(query, key) and torch.equal(key, value)
+ kv_same = torch.equal(key, value)
+
+ tgt_len, bsz, embed_dim = query.size()
+ assert embed_dim == embed_dim_to_check
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
+ assert key.size() == value.size()
+
+ head_dim = embed_dim // num_heads
+ assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+ scaling = float(head_dim) ** -0.5
+
+ if use_separate_proj_weight is not True:
+ if qkv_same:
+ # self-attention
+ q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
+
+ elif kv_same:
+ # encoder-decoder attention
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = 0
+ _end = embed_dim
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ q = linear(query, _w, _b)
+
+ if key is None:
+ assert value is None
+ k = None
+ v = None
+ else:
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim
+ _end = None
+ _w = in_proj_weight[_start:, :]
+ if _b is not None:
+ _b = _b[_start:]
+ k, v = linear(key, _w, _b).chunk(2, dim=-1)
+
+ else:
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = 0
+ _end = embed_dim
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ q = linear(query, _w, _b)
+
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim
+ _end = embed_dim * 2
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ k = linear(key, _w, _b)
+
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim * 2
+ _end = None
+ _w = in_proj_weight[_start:, :]
+ if _b is not None:
+ _b = _b[_start:]
+ v = linear(value, _w, _b)
+ else:
+ q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
+ len1, len2 = q_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == query.size(-1)
+
+ k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
+ len1, len2 = k_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == key.size(-1)
+
+ v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
+ len1, len2 = v_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == value.size(-1)
+
+ if in_proj_bias is not None:
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
+ else:
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias)
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias)
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias)
+ q = q * scaling
+
+ if bias_k is not None and bias_v is not None:
+ if static_k is None and static_v is None:
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+ if attn_mask is not None:
+ attn_mask = torch.cat([attn_mask,
+ torch.zeros((attn_mask.size(0), 1),
+ dtype=attn_mask.dtype,
+ device=attn_mask.device)], dim=1)
+ if key_padding_mask is not None:
+ key_padding_mask = torch.cat(
+ [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
+ dtype=key_padding_mask.dtype,
+ device=key_padding_mask.device)], dim=1)
+ else:
+ assert static_k is None, "bias cannot be added to static key."
+ assert static_v is None, "bias cannot be added to static value."
+ else:
+ assert bias_k is None
+ assert bias_v is None
+
+ q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+ if k is not None:
+ k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+ if v is not None:
+ v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+
+ if static_k is not None:
+ assert static_k.size(0) == bsz * num_heads
+ assert static_k.size(2) == head_dim
+ k = static_k
+
+ if static_v is not None:
+ assert static_v.size(0) == bsz * num_heads
+ assert static_v.size(2) == head_dim
+ v = static_v
+
+ src_len = k.size(1)
+
+ if key_padding_mask is not None:
+ assert key_padding_mask.size(0) == bsz
+ assert key_padding_mask.size(1) == src_len
+
+ if add_zero_attn:
+ src_len += 1
+ k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
+ v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
+ if attn_mask is not None:
+ attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1),
+ dtype=attn_mask.dtype,
+ device=attn_mask.device)], dim=1)
+ if key_padding_mask is not None:
+ key_padding_mask = torch.cat(
+ [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
+ dtype=key_padding_mask.dtype,
+ device=key_padding_mask.device)], dim=1)
+
+ attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+ assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
+
+ ######### ADDITION OF RPR ###########
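+ # Relative position bias: Srel = skew(Q . Er^T) is added to the scaled
+ # dot-product logits before masking and softmax (Shaw et al. / Huang et al.).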
+ if(rpr_mat is not None):
+ rpr_mat = _get_valid_embedding(rpr_mat, q.shape[1], k.shape[1])
+ qe = torch.einsum("hld,md->hlm", q, rpr_mat)
+ srel = _skew(qe)
+ attn_output_weights += srel
+
+ if attn_mask is not None:
+ attn_mask = attn_mask.unsqueeze(0)
+ attn_output_weights += attn_mask
+
+ if key_padding_mask is not None:
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+ attn_output_weights = attn_output_weights.masked_fill(
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
+ float('-inf'),
+ )
+ attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
+
+ attn_output_weights = softmax(
+ attn_output_weights, dim=-1)
+
+ attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
+
+ attn_output = torch.bmm(attn_output_weights, v)
+ assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+ attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+
+ if need_weights:
+ # average attention weights over heads
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
+ else:
+ return attn_output, None
+
+def _get_valid_embedding(Er, len_q, len_k):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Gets valid embeddings based on max length of RPR attention
+ ----------
+ """
+
+ len_e = Er.shape[0]
+ start = max(0, len_e - len_q)
+ return Er[start:, :]
+
+def _skew(qe):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Performs the skew optimized RPR computation (https://arxiv.org/abs/1809.04281)
+ ----------
+ """
+ sz = qe.shape[1]
+ mask = (torch.triu(torch.ones(sz, sz).to(qe.device)) == 1).float().flip(0)
+
+ qe = mask * qe
+ qe = F.pad(qe, (1,0, 0,0, 0,0))
+ qe = torch.reshape(qe, (qe.shape[0], qe.shape[2], qe.shape[1]))
+
+ srel = qe[:, 1:, :]
+ return srel
diff --git a/model/video_music_transformer.py b/model/video_music_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..146e8ba81c1e26d3b70e1f72287cc8a4793ff8a4
--- /dev/null
+++ b/model/video_music_transformer.py
@@ -0,0 +1,205 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.normalization import LayerNorm
+import random
+import numpy as np
+from utilities.constants import *
+from utilities.device import get_device
+from .positional_encoding import PositionalEncoding
+from .rpr import TransformerDecoderRPR, TransformerDecoderLayerRPR
+from datetime import datetime
+import json
+
+
+class VideoMusicTransformer(nn.Module):
+ def __init__(self, n_layers=6, num_heads=8, d_model=512, dim_feedforward=1024,
+ dropout=0.1, max_sequence_midi =2048, max_sequence_video=300, max_sequence_chord=300, total_vf_dim = 0, rpr=False):
+ super(VideoMusicTransformer, self).__init__()
+ self.nlayers = n_layers
+ self.nhead = num_heads
+ self.d_model = d_model
+ self.d_ff = dim_feedforward
+ self.dropout = dropout
+ self.max_seq_midi = max_sequence_midi
+ self.max_seq_video = max_sequence_video
+ self.max_seq_chord = max_sequence_chord
+ self.rpr = rpr
+
+ # Input embedding for video and music features
+ self.embedding = nn.Embedding(CHORD_SIZE, self.d_model)
+ self.embedding_root = nn.Embedding(CHORD_ROOT_SIZE, self.d_model)
+ self.embedding_attr = nn.Embedding(CHORD_ATTR_SIZE, self.d_model)
+
+ self.total_vf_dim = total_vf_dim
+ self.Linear_vis = nn.Linear(self.total_vf_dim, self.d_model)
+ self.Linear_chord = nn.Linear(self.d_model+1, self.d_model)
+
+ # Positional encoding
+ self.positional_encoding = PositionalEncoding(self.d_model, self.dropout, self.max_seq_chord)
+ self.positional_encoding_video = PositionalEncoding(self.d_model, self.dropout, self.max_seq_video)
+
+ # Add condition (minor or major)
+ self.condition_linear = nn.Linear(1, self.d_model)
+
+ # Base transformer
+ if(not self.rpr):
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=self.nlayers, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff
+ )
+ # RPR Transformer
+ else:
+ decoder_norm = LayerNorm(self.d_model)
+ decoder_layer = TransformerDecoderLayerRPR(self.d_model, self.nhead, self.d_ff, self.dropout, er_len=self.max_seq_chord)
+ decoder = TransformerDecoderRPR(decoder_layer, self.nlayers, decoder_norm)
+ self.transformer = nn.Transformer(
+ d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
+ num_decoder_layers=self.nlayers, dropout=self.dropout, # activation=self.ff_activ,
+ dim_feedforward=self.d_ff, custom_decoder=decoder
+ )
+
+ self.Wout = nn.Linear(self.d_model, CHORD_SIZE)
+ self.Wout_root = nn.Linear(self.d_model, CHORD_ROOT_SIZE)
+ self.Wout_attr = nn.Linear(self.d_model, CHORD_ATTR_SIZE)
+ self.softmax = nn.Softmax(dim=-1)
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ def forward(self, x, x_root, x_attr, feature_semantic_list, feature_key, feature_scene_offset, feature_motion, feature_emotion, mask=True):
+ if(mask is True):
+ mask = self.transformer.generate_square_subsequent_mask(x.shape[1]).to(self.device)
+ else:
+ mask = None
+
+ x_root = self.embedding_root(x_root)
+ x_attr = self.embedding_attr(x_attr)
+ x = x_root + x_attr
+
+ feature_key_padded = torch.full((x.shape[0], x.shape[1], 1), feature_key.item())
+ feature_key_padded = feature_key_padded.to(self.device)
+ x = torch.cat([x, feature_key_padded], dim=-1)
+
+ xf = self.Linear_chord(x)
+
+ ### Video (SemanticList + SceneOffset + Motion + Emotion) (ENCODER) ###
+ vf_concat = feature_semantic_list[0].float()
+
+ for i in range(1, len(feature_semantic_list)):
+ vf_concat = torch.cat( (vf_concat, feature_semantic_list[i].float()), dim=2)
+
+ vf_concat = torch.cat([vf_concat, feature_scene_offset.unsqueeze(-1).float()], dim=-1) # append scene-offset scalar
+ vf_concat = torch.cat([vf_concat, feature_motion.unsqueeze(-1).float()], dim=-1) # append motion scalar
+ vf_concat = torch.cat([vf_concat, feature_emotion.float()], dim=-1) # -> (batch, max_seq_video, total_vf_dim)
+ vf = self.Linear_vis(vf_concat)
+
+ ### POSITIONAL ENCODING ###
+
+ xf = xf.permute(1,0,2) # -> (seq_len, batch_size, d_model)
+ vf = vf.permute(1,0,2) # -> (max_seq_video, batch_size, d_model)
+
+ xf = self.positional_encoding(xf)
+ vf = self.positional_encoding_video(vf)
+
+ ### TRANSFORMER ###
+ x_out = self.transformer(src=vf, tgt=xf, tgt_mask=mask)
+ x_out = x_out.permute(1,0,2)
+
+ if IS_SEPERATED:
+ y_root = self.Wout_root(x_out)
+ y_attr = self.Wout_attr(x_out)
+ del mask
+ return y_root, y_attr
+ else:
+ y = self.Wout(x_out)
+ del mask
+ return y
+
+ def generate(self, feature_semantic_list = [], feature_key=None, feature_scene_offset=None, feature_motion=None, feature_emotion=None,
+ primer=None, primer_root=None, primer_attr=None, target_seq_length=300, beam=0,
+ beam_chance=1.0, max_conseq_N = 0, max_conseq_chord = 2):
+
+ assert (not self.training), "Cannot generate while in training mode"
+ print("Generating sequence of max length:", target_seq_length)
+
+ with open('dataset/vevo_meta/chord_inv.json') as json_file:
+ chordInvDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_root.json') as json_file:
+ chordRootDic = json.load(json_file)
+ with open('dataset/vevo_meta/chord_attr.json') as json_file:
+ chordAttrDic = json.load(json_file)
+
+ gen_seq = torch.full((1,target_seq_length), CHORD_PAD, dtype=TORCH_LABEL_TYPE, device=self.device)
+ gen_seq_root = torch.full((1,target_seq_length), CHORD_ROOT_PAD, dtype=TORCH_LABEL_TYPE, device=self.device)
+ gen_seq_attr = torch.full((1,target_seq_length), CHORD_ATTR_PAD, dtype=TORCH_LABEL_TYPE, device=self.device)
+
+ num_primer = len(primer)
+ gen_seq[..., :num_primer] = primer.type(TORCH_LABEL_TYPE).to(self.device)
+ gen_seq_root[..., :num_primer] = primer_root.type(TORCH_LABEL_TYPE).to(self.device)
+ gen_seq_attr[..., :num_primer] = primer_attr.type(TORCH_LABEL_TYPE).to(self.device)
+
+ cur_i = num_primer
+ while(cur_i < target_seq_length):
+ y = self.softmax( self.forward( gen_seq[..., :cur_i], gen_seq_root[..., :cur_i], gen_seq_attr[..., :cur_i],
+ feature_semantic_list, feature_key, feature_scene_offset, feature_motion, feature_emotion) )[..., :CHORD_END]
+
+ token_probs = y[:, cur_i-1, :]
+ if(beam == 0):
+ beam_ran = 2.0
+ else:
+ beam_ran = random.uniform(0,1)
+ if(beam_ran <= beam_chance):
+ token_probs = token_probs.flatten()
+ top_res, top_i = torch.topk(token_probs, beam)
+ beam_rows = top_i // CHORD_SIZE
+ beam_cols = top_i % CHORD_SIZE
+ gen_seq = gen_seq[beam_rows, :]
+ gen_seq[..., cur_i] = beam_cols
+ else:
+ # token_probs.shape : [1, 157]
+ # 0: N, 1: C, ... , 156: B:maj7
+ # 157 chordEnd 158 padding
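+ # Constrain sampling: when max_conseq_N == 0 the N (no-chord) token is
+ # disallowed, and a chord that has already repeated max_conseq_chord
+ # times in a row has its probability zeroed before sampling.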
+ if max_conseq_N == 0:
+ token_probs[0][0] = 0.0
+ isMaxChord = True
+ if cur_i >= max_conseq_chord :
+ preChord = gen_seq[0][cur_i-1].item()
+ for k in range (1, max_conseq_chord):
+ if preChord != gen_seq[0][cur_i-1-k].item():
+ isMaxChord = False
+ else:
+ isMaxChord = False
+
+ if isMaxChord:
+ preChord = gen_seq[0][cur_i-1].item()
+ token_probs[0][preChord] = 0.0
+
+ distrib = torch.distributions.categorical.Categorical(probs=token_probs)
+ next_token = distrib.sample()
+ gen_seq[:, cur_i] = next_token
+ gen_chord = chordInvDic[ str( next_token.item() ) ]
+
+ chord_arr = gen_chord.split(":")
+ if len(chord_arr) == 1:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = 1
+ chordRootID = torch.tensor([chordRootID]).to(self.device)
+ chordAttrID = torch.tensor([chordAttrID]).to(self.device)
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+ elif len(chord_arr) == 2:
+ chordRootID = chordRootDic[chord_arr[0]]
+ chordAttrID = chordAttrDic[chord_arr[1]]
+ chordRootID = torch.tensor([chordRootID]).to(self.device)
+ chordAttrID = torch.tensor([chordAttrID]).to(self.device)
+ gen_seq_root[:, cur_i] = chordRootID
+ gen_seq_attr[:, cur_i] = chordAttrID
+
+ # Let the transformer decide to end if it wants to
+ if(next_token == CHORD_END):
+ print("Model called end of sequence at:", cur_i, "/", target_seq_length)
+ break
+ cur_i += 1
+ if(cur_i % 50 == 0):
+ print(cur_i, "/", target_seq_length)
+ return gen_seq[:, :cur_i]
+
diff --git a/model/video_regression.py b/model/video_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aaf4968ed76b5170afd3a2ba06bcfd76a129dd0
--- /dev/null
+++ b/model/video_regression.py
@@ -0,0 +1,64 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.normalization import LayerNorm
+import random
+import numpy as np
+from utilities.constants import *
+from utilities.device import get_device
+from datetime import datetime
+
+import torch.nn.functional as F
+
+class VideoRegression(nn.Module):
+ def __init__(self, n_layers=2, d_model=64, dropout=0.1, max_sequence_video=300, total_vf_dim = 0, regModel="bilstm"):
+ super(VideoRegression, self).__init__()
+ self.nlayers = n_layers
+ self.d_model = d_model
+ self.dropout = dropout
+ self.max_seq_video = max_sequence_video
+ self.total_vf_dim = total_vf_dim
+ self.regModel = regModel
+
+ self.bilstm = nn.LSTM(self.total_vf_dim, self.d_model, self.nlayers, bidirectional=True)
+ self.bigru = nn.GRU(self.total_vf_dim, self.d_model, self.nlayers, bidirectional=True)
+ self.bifc = nn.Linear(self.d_model * 2, 2)
+
+ self.lstm = nn.LSTM(self.total_vf_dim, self.d_model, self.nlayers)
+ self.gru = nn.GRU(self.total_vf_dim, self.d_model, self.nlayers)
+ self.fc = nn.Linear(self.d_model, 2)
+
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+
+
+ def forward(self, feature_semantic_list, feature_scene_offset, feature_motion, feature_emotion):
+ ### Video (SemanticList + SceneOffset + Motion + Emotion) (ENCODER) ###
+ vf_concat = feature_semantic_list[0].float()
+ for i in range(1, len(feature_semantic_list)):
+ vf_concat = torch.cat( (vf_concat, feature_semantic_list[i].float()), dim=2)
+
+ vf_concat = torch.cat([vf_concat, feature_scene_offset.unsqueeze(-1).float()], dim=-1)
+ vf_concat = torch.cat([vf_concat, feature_motion.unsqueeze(-1).float()], dim=-1)
+ vf_concat = torch.cat([vf_concat, feature_emotion.float()], dim=-1)
+
+ vf_concat = vf_concat.permute(1,0,2)
+ vf_concat = F.dropout(vf_concat, p=self.dropout, training=self.training)
+
+ if self.regModel == "bilstm":
+ out, _ = self.bilstm(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.bifc(out)
+ elif self.regModel == "bigru":
+ out, _ = self.bigru(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.bifc(out)
+ elif self.regModel == "lstm":
+ out, _ = self.lstm(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.fc(out)
+ elif self.regModel == "gru":
+ out, _ = self.gru(vf_concat)
+ out = out.permute(1,0,2)
+ out = self.fc(out)
+ return out
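+
+# Shape sketch (assumptions noted, not part of the training code): each entry of
+# feature_semantic_list is (batch, max_seq_video, D_i); after concatenation the
+# last dim equals total_vf_dim, and the head maps every timestep to 2 targets
+# (presumably note density and loudness, given the loudness/velocity mapping
+# constants defined elsewhere in this repo).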
+
diff --git a/packages.txt b/packages.txt
new file mode 100644
index 0000000000000000000000000000000000000000..61b2c6ce9ffb278c58aadd9cfe68cec1ae3ba1aa
--- /dev/null
+++ b/packages.txt
@@ -0,0 +1,2 @@
+ffmpeg
+fluidsynth
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ab0c3868c755f2078a1e8e382b979cd54c792751
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,24 @@
+torchvision==0.9.0
+torch==1.8.0
+clip @ git+https://github.com/openai/CLIP.git
+Cython==3.0.5
+numpy==1.19.5
+coloredlogs==15.0.1
+ffmpeg_python==0.2.0
+ftfy==6.1.1
+matplotlib==3.5.3
+midi2audio==0.1.1
+MIDIUtil==1.2.1
+moviepy==1.0.3
+music21==7.3.3
+opencv_python==4.7.0.72
+pandas==1.3.5
+Pillow==8.4.0
+pretty_midi==0.2.9
+pydub==0.25.1
+regex==2022.10.31
+scenedetect==0.6.1
+scikit_learn==1.0.2
+scipy==1.7.3
+gradio==4.7.1
+pyfluidsynth
diff --git a/saved_models/AMT/README.md b/saved_models/AMT/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..84647b908c040cd170233e1b0c1289fd238dfa49
--- /dev/null
+++ b/saved_models/AMT/README.md
@@ -0,0 +1 @@
+Put the pretrained model weight pickle files in this directory.
diff --git a/saved_models/AMT/best_loss_weights.pickle b/saved_models/AMT/best_loss_weights.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..25d01b2be9250cc3af42111c82e7d9b2234203f6
--- /dev/null
+++ b/saved_models/AMT/best_loss_weights.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:859f0fac92c6d4ac84446983cd138ca8d625a41e1854edbd86ea29a14f0aad28
+size 131375779
diff --git a/saved_models/AMT/best_rmse_weights.pickle b/saved_models/AMT/best_rmse_weights.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..f6f6e3936b81fff9856628aa8d79b5c8292ca264
--- /dev/null
+++ b/saved_models/AMT/best_rmse_weights.pickle
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3680851df4f8bb7902539bc10b3025eaa7162410826c164b4aec4d44a8c19818
+size 5463439
diff --git a/third_party/midi_processor/__pycache__/processor.cpython-37.pyc b/third_party/midi_processor/__pycache__/processor.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..772264dd0765aa59c39565285b5ba9da3d17753c
Binary files /dev/null and b/third_party/midi_processor/__pycache__/processor.cpython-37.pyc differ
diff --git a/third_party/midi_processor/__pycache__/processor.cpython-38.pyc b/third_party/midi_processor/__pycache__/processor.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ae0acfff0e1fbcac7a233fbfef4811ef831b9b
Binary files /dev/null and b/third_party/midi_processor/__pycache__/processor.cpython-38.pyc differ
diff --git a/third_party/midi_processor/processor.py b/third_party/midi_processor/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6531853b3888c0b50e20fc72ca207dfba5cf49a
--- /dev/null
+++ b/third_party/midi_processor/processor.py
@@ -0,0 +1,261 @@
+import pretty_midi
+
+RANGE_NOTE_ON = 128
+RANGE_NOTE_OFF = 128
+RANGE_VEL = 32
+RANGE_TIME_SHIFT = 100
+
+START_IDX = {
+ 'note_on': 0,
+ 'note_off': RANGE_NOTE_ON,
+ 'time_shift': RANGE_NOTE_ON + RANGE_NOTE_OFF,
+ 'velocity': RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_TIME_SHIFT
+}
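+
+# Event vocabulary layout (388 tokens total):
+# [0, 128) note_on pitch
+# [128, 256) note_off pitch
+# [256, 356) time_shift in 10 ms steps ((value+1)/100 s, up to 1 s)
+# [356, 388) velocity, quantized into 32 bins (MIDI velocity // 4)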
+
+class SustainAdapter:
+ def __init__(self, time, type):
+ self.start = time
+ self.type = type
+
+
+class SustainDownManager:
+ def __init__(self, start, end):
+ self.start = start
+ self.end = end
+ self.managed_notes = []
+ self._note_dict = {} # key: pitch, value: note.start
+
+ def add_managed_note(self, note: pretty_midi.Note):
+ self.managed_notes.append(note)
+
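+ # Iterating in reverse, each note's end snaps to the next onset of the same
+ # pitch (a re-strike while the pedal is down), or else extends to at least
+ # the pedal release time.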
+ def transposition_notes(self):
+ for note in reversed(self.managed_notes):
+ try:
+ note.end = self._note_dict[note.pitch]
+ except KeyError:
+ note.end = max(self.end, note.end)
+ self._note_dict[note.pitch] = note.start
+
+
+# Splits each note into separate note_on and note_off events
+class SplitNote:
+ def __init__(self, type, time, value, velocity):
+ ## type: note_on, note_off
+ self.type = type
+ self.time = time
+ self.velocity = velocity
+ self.value = value
+
+ def __repr__(self):
+ return '<[SNote] time: {} type: {}, value: {}, velocity: {}>'\
+ .format(self.time, self.type, self.value, self.velocity)
+
+
+class Event:
+ def __init__(self, event_type, value):
+ self.type = event_type
+ self.value = value
+
+ def __repr__(self):
+ return '<Event type: {}, value: {}>'.format(self.type, self.value)
+
+ def to_int(self):
+ return START_IDX[self.type] + self.value
+
+ @staticmethod
+ def from_int(int_value):
+ info = Event._type_check(int_value)
+ return Event(info['type'], info['value'])
+
+ @staticmethod
+ def _type_check(int_value):
+ range_note_on = range(0, RANGE_NOTE_ON)
+ range_note_off = range(RANGE_NOTE_ON, RANGE_NOTE_ON+RANGE_NOTE_OFF)
+ range_time_shift = range(RANGE_NOTE_ON+RANGE_NOTE_OFF,RANGE_NOTE_ON+RANGE_NOTE_OFF+RANGE_TIME_SHIFT)
+
+ valid_value = int_value
+
+ if int_value in range_note_on:
+ return {'type': 'note_on', 'value': valid_value}
+ elif int_value in range_note_off:
+ valid_value -= RANGE_NOTE_ON
+ return {'type': 'note_off', 'value': valid_value}
+ elif int_value in range_time_shift:
+ valid_value -= (RANGE_NOTE_ON + RANGE_NOTE_OFF)
+ return {'type': 'time_shift', 'value': valid_value}
+ else:
+ valid_value -= (RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_TIME_SHIFT)
+ return {'type': 'velocity', 'value': valid_value}
+
+
+def _divide_note(notes):
+ result_array = []
+ notes.sort(key=lambda x: x.start)
+
+ for note in notes:
+ on = SplitNote('note_on', note.start, note.pitch, note.velocity)
+ off = SplitNote('note_off', note.end, note.pitch, None)
+ result_array += [on, off]
+ return result_array
+
+
+def _merge_note(snote_sequence):
+ note_on_dict = {}
+ result_array = []
+
+ for snote in snote_sequence:
+ # print(note_on_dict)
+ if snote.type == 'note_on':
+ note_on_dict[snote.value] = snote
+ elif snote.type == 'note_off':
+ try:
+ on = note_on_dict[snote.value]
+ off = snote
+ if off.time - on.time == 0:
+ continue
+ result = pretty_midi.Note(on.velocity, snote.value, on.time, off.time)
+ result_array.append(result)
+ except KeyError:
+ print('info: removed note_off with no matching note_on (pitch: {})'.format(snote.value))
+ return result_array
+
+
+def _snote2events(snote: SplitNote, prev_vel: int):
+ result = []
+ if snote.velocity is not None:
+ modified_velocity = snote.velocity // 4
+ if prev_vel != modified_velocity:
+ result.append(Event(event_type='velocity', value=modified_velocity))
+ result.append(Event(event_type=snote.type, value=snote.value))
+ return result
+
+
+def _event_seq2snote_seq(event_sequence):
+ timeline = 0
+ velocity = 0
+ snote_seq = []
+
+ for event in event_sequence:
+ if event.type == 'time_shift':
+ timeline += ((event.value+1) / 100)
+ elif event.type == 'velocity': # elif: a time_shift event must not also emit a SplitNote
+ velocity = event.value * 4
+ else:
+ snote = SplitNote(event.type, timeline, event.value, velocity)
+ snote_seq.append(snote)
+ return snote_seq
+
+
+def _make_time_shift_events(prev_time, post_time):
+ time_interval = int(round((post_time - prev_time) * 100))
+ results = []
+ while time_interval >= RANGE_TIME_SHIFT:
+ results.append(Event(event_type='time_shift', value=RANGE_TIME_SHIFT-1))
+ time_interval -= RANGE_TIME_SHIFT
+ if time_interval == 0:
+ return results
+ else:
+ return results + [Event(event_type='time_shift', value=time_interval-1)]
+
+
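+# Pair each sustain-pedal press (CC64 value >= 64) with the following release
+# (value < 64) into a SustainDownManager covering that [start, end) window.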
+def _control_preprocess(ctrl_changes):
+ sustains = []
+
+ manager = None
+ for ctrl in ctrl_changes:
+ if ctrl.value >= 64 and manager is None:
+ # sustain down
+ manager = SustainDownManager(start=ctrl.time, end=None)
+ elif ctrl.value < 64 and manager is not None:
+ # sustain up
+ manager.end = ctrl.time
+ sustains.append(manager)
+ manager = None
+ elif ctrl.value < 64 and len(sustains) > 0:
+ sustains[-1].end = ctrl.time
+ return sustains
+
+
+def _note_preprocess(sustains, notes):
+ note_stream = []
+
+ if sustains: # if the midi file has sustain controls
+ for sustain in sustains:
+ for note_idx, note in enumerate(notes):
+ if note.start < sustain.start:
+ note_stream.append(note)
+ elif note.start > sustain.end:
+ notes = notes[note_idx:]
+ sustain.transposition_notes()
+ break
+ else:
+ sustain.add_managed_note(note)
+
+ for sustain in sustains:
+ note_stream += sustain.managed_notes
+
+ else: # else, just push everything into note stream
+ for note_idx, note in enumerate(notes):
+ note_stream.append(note)
+
+ note_stream.sort(key= lambda x: x.start)
+ return note_stream
+
+
+def encode_midi(file_path):
+ events = []
+ notes = []
+ mid = pretty_midi.PrettyMIDI(midi_file=file_path)
+
+ for inst in mid.instruments:
+ inst_notes = inst.notes
+ # CC number 64 is the sustain pedal. For the full list of control-change
+ # numbers, see https://www.midi.org/specifications-old/item/table-3-control-change-messages-data-bytes-2
+ ctrls = _control_preprocess([ctrl for ctrl in inst.control_changes if ctrl.number == 64])
+ notes += _note_preprocess(ctrls, inst_notes)
+
+ dnotes = _divide_note(notes)
+ # print(dnotes)
+ dnotes.sort(key=lambda x: x.time)
+ # print('sorted:')
+ # print(dnotes)
+ cur_time = 0
+ cur_vel = 0
+ for snote in dnotes:
+ events += _make_time_shift_events(prev_time=cur_time, post_time=snote.time)
+ events += _snote2events(snote=snote, prev_vel=cur_vel)
+ cur_time = snote.time
+ cur_vel = snote.velocity
+
+ return [e.to_int() for e in events]
+
+def decode_midi(idx_array, file_path=None):
+ event_sequence = [Event.from_int(idx) for idx in idx_array]
+ # print(event_sequence)
+ snote_seq = _event_seq2snote_seq(event_sequence)
+ note_seq = _merge_note(snote_seq)
+ note_seq.sort(key=lambda x:x.start)
+
+ mid = pretty_midi.PrettyMIDI()
+ # if you want to change the instrument, see https://www.midi.org/specifications/item/gm-level-1-sound-set
+ instrument = pretty_midi.Instrument(1, False, "Developed By Jaeyong Kang")
+ instrument.notes = note_seq
+
+ mid.instruments.append(instrument)
+ if file_path is not None:
+ mid.write(file_path)
+ return mid
+
+# if __name__ == '__main__':
+# encoded = encode_midi('bin/ADIG04.mid')
+# print(encoded)
+# decided = decode_midi(encoded,file_path='bin/test.mid')
+
+# ins = pretty_midi.PrettyMIDI('bin/ADIG04.mid')
+# print(ins)
+# print(ins.instruments[0])
+# for i in ins.instruments:
+# print(i.control_changes)
+# print(i.notes)
+
diff --git a/utilities/__init__.py b/utilities/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/utilities/__pycache__/__init__.cpython-37.pyc b/utilities/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1c917f3725cc057a73fb1c8c56035d3f72f3df3
Binary files /dev/null and b/utilities/__pycache__/__init__.cpython-37.pyc differ
diff --git a/utilities/__pycache__/__init__.cpython-38.pyc b/utilities/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..413f568d22a4d6720f04f2fd21bac47445825776
Binary files /dev/null and b/utilities/__pycache__/__init__.cpython-38.pyc differ
diff --git a/utilities/__pycache__/argument_funcs.cpython-37.pyc b/utilities/__pycache__/argument_funcs.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3d8762b8a135a44f00bb9317ba234f5528c3154
Binary files /dev/null and b/utilities/__pycache__/argument_funcs.cpython-37.pyc differ
diff --git a/utilities/__pycache__/chord_to_midi.cpython-37.pyc b/utilities/__pycache__/chord_to_midi.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29aea4e5b863c4c6cb365b6ac95a75c2ed930d2c
Binary files /dev/null and b/utilities/__pycache__/chord_to_midi.cpython-37.pyc differ
diff --git a/utilities/__pycache__/chord_to_midi.cpython-38.pyc b/utilities/__pycache__/chord_to_midi.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f66382aa7a37bf413c1f523e77cd0d5b17ad32a
Binary files /dev/null and b/utilities/__pycache__/chord_to_midi.cpython-38.pyc differ
diff --git a/utilities/__pycache__/constants.cpython-37.pyc b/utilities/__pycache__/constants.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..152e8575802dc8868fa801e9fba3364833e38729
Binary files /dev/null and b/utilities/__pycache__/constants.cpython-37.pyc differ
diff --git a/utilities/__pycache__/constants.cpython-38.pyc b/utilities/__pycache__/constants.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc612d3513fae337d93d7d0dbc3234e792e663ed
Binary files /dev/null and b/utilities/__pycache__/constants.cpython-38.pyc differ
diff --git a/utilities/__pycache__/device.cpython-37.pyc b/utilities/__pycache__/device.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b3105df0fb1ed22b15ba7e1dc8b77112dbeba3c
Binary files /dev/null and b/utilities/__pycache__/device.cpython-37.pyc differ
diff --git a/utilities/__pycache__/device.cpython-38.pyc b/utilities/__pycache__/device.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cc932b1e2d496ed28d13778dd032e4ec6d5c5f8
Binary files /dev/null and b/utilities/__pycache__/device.cpython-38.pyc differ
diff --git a/utilities/argument_funcs.py b/utilities/argument_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..8aea43e4d23f0bc550cacd5a364a119af22d7999
--- /dev/null
+++ b/utilities/argument_funcs.py
@@ -0,0 +1,275 @@
+import argparse
+from .constants import *
+
+version = VERSION
+split_ver = SPLIT_VER
+split_path = "split_" + split_ver
+
+def parse_train_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("-dataset_dir", type=str, default="./dataset/", help="Folder of VEVO dataset")
+
+ parser.add_argument("-input_dir_music", type=str, default="./dataset/vevo_chord/" + MUSIC_TYPE, help="Folder of video CNN feature files")
+ parser.add_argument("-input_dir_video", type=str, default="./dataset/vevo_vis", help="Folder of video CNN feature files")
+
+ parser.add_argument("-output_dir", type=str, default="./saved_models", help="Folder to save model weights. Saves one every epoch")
+
+ parser.add_argument("-weight_modulus", type=int, default=1, help="How often to save epoch weights (ex: value of 10 means save every 10 epochs)")
+ parser.add_argument("-print_modulus", type=int, default=1, help="How often to print train results for a batch (batch loss, learn rate, etc.)")
+ parser.add_argument("-n_workers", type=int, default=1, help="Number of threads for the dataloader")
+ parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
+ parser.add_argument("--no_tensorboard", action="store_true", help="Turns off tensorboard result reporting")
+ parser.add_argument("-continue_weights", type=str, default=None, help="Model weights to continue training based on")
+ parser.add_argument("-continue_epoch", type=int, default=None, help="Epoch the continue_weights model was at")
+ parser.add_argument("-lr", type=float, default=None, help="Constant learn rate. Leave as None for a custom scheduler.")
+ parser.add_argument("-ce_smoothing", type=float, default=None, help="Smoothing parameter for smoothed cross entropy loss (defaults to no smoothing)")
+ parser.add_argument("-batch_size", type=int, default=1, help="Batch size to use")
+ parser.add_argument("-epochs", type=int, default=5, help="Number of epochs to use")
+
+ parser.add_argument("-max_sequence_midi", type=int, default=2048, help="Maximum midi sequence to consider")
+ parser.add_argument("-max_sequence_video", type=int, default=300, help="Maximum video sequence to consider")
+ parser.add_argument("-max_sequence_chord", type=int, default=300, help="Maximum video sequence to consider")
+
+ parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
+ parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
+ parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
+ parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
+ parser.add_argument("-dropout", type=float, default=0.1, help="Dropout rate")
+
+ parser.add_argument("-is_video", type=bool, default=IS_VIDEO, help="MusicTransformer or VideoMusicTransformer")
+
+ if IS_VIDEO:
+ parser.add_argument("-vis_models", type=str, default=VIS_MODELS_SORTED, help="...")
+ else:
+ parser.add_argument("-vis_models", type=str, default="", help="...")
+
+ parser.add_argument("-emo_model", type=str, default="6c_l14p", help="...")
+ parser.add_argument("-rpr", type=bool, default=RPR, help="...")
+ return parser.parse_args()
+
+def print_train_args(args):
+ print(SEPERATOR)
+
+ print("dataset_dir:", args.dataset_dir )
+
+ print("input_dir_music:", args.input_dir_music)
+ print("input_dir_video:", args.input_dir_video)
+
+ print("output_dir:", args.output_dir)
+
+ print("weight_modulus:", args.weight_modulus)
+ print("print_modulus:", args.print_modulus)
+ print("")
+ print("n_workers:", args.n_workers)
+ print("force_cpu:", args.force_cpu)
+ print("tensorboard:", not args.no_tensorboard)
+ print("")
+ print("continue_weights:", args.continue_weights)
+ print("continue_epoch:", args.continue_epoch)
+ print("")
+ print("lr:", args.lr)
+ print("ce_smoothing:", args.ce_smoothing)
+ print("batch_size:", args.batch_size)
+ print("epochs:", args.epochs)
+ print("")
+ print("rpr:", args.rpr)
+
+ print("max_sequence_midi:", args.max_sequence_midi)
+ print("max_sequence_video:", args.max_sequence_video)
+ print("max_sequence_chord:", args.max_sequence_chord)
+
+ print("n_layers:", args.n_layers)
+ print("num_heads:", args.num_heads)
+ print("d_model:", args.d_model)
+ print("")
+ print("dim_feedforward:", args.dim_feedforward)
+ print("dropout:", args.dropout)
+ print("is_video:", args.is_video)
+
+ print(SEPERATOR)
+ print("")
+
+def parse_eval_args():
+ if IS_VIDEO:
+ modelpath = "./saved_models/AMT/best_acc_weights.pickle"
+ # modelpath = "./saved_models/"+version+ "/"+VIS_MODELS_PATH+"/results/best_loss_weights.pickle"
+ else:
+ modelpath = "./saved_models/"+version+ "/no_video/results/best_acc_weights.pickle"
+
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("-dataset_dir", type=str, default="./dataset/", help="Folder of VEVO dataset")
+
+ parser.add_argument("-input_dir_music", type=str, default="./dataset/vevo_chord/" + MUSIC_TYPE, help="Folder of video CNN feature files")
+ parser.add_argument("-input_dir_video", type=str, default="./dataset/vevo_vis", help="Folder of video CNN feature files")
+
+ parser.add_argument("-model_weights", type=str, default= modelpath, help="Pickled model weights file saved with torch.save and model.state_dict()")
+
+ parser.add_argument("-n_workers", type=int, default=1, help="Number of threads for the dataloader")
+ parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
+ parser.add_argument("-batch_size", type=int, default=1, help="Batch size to use")
+
+ parser.add_argument("-max_sequence_midi", type=int, default=2048, help="Maximum midi sequence to consider")
+ parser.add_argument("-max_sequence_video", type=int, default=300, help="Maximum video sequence to consider")
+ parser.add_argument("-max_sequence_chord", type=int, default=300, help="Maximum video sequence to consider")
+
+ parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
+ parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
+ parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
+ parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
+
+ parser.add_argument("-is_video", type=bool, default=IS_VIDEO, help="MusicTransformer or VideoMusicTransformer")
+
+ if IS_VIDEO:
+ parser.add_argument("-vis_models", type=str, default=VIS_MODELS_SORTED, help="...")
+ else:
+ parser.add_argument("-vis_models", type=str, default="", help="...")
+
+ parser.add_argument("-emo_model", type=str, default="6c_l14p", help="...")
+ parser.add_argument("-rpr", type=bool, default=RPR, help="...")
+ return parser.parse_args()
+
+def print_eval_args(args):
+ print(SEPERATOR)
+ print("input_dir_music:", args.input_dir_music)
+ print("input_dir_video:", args.input_dir_video)
+
+ print("model_weights:", args.model_weights)
+ print("n_workers:", args.n_workers)
+ print("force_cpu:", args.force_cpu)
+ print("")
+ print("batch_size:", args.batch_size)
+ print("")
+ print("rpr:", args.rpr)
+
+ print("max_sequence_midi:", args.max_sequence_midi)
+ print("max_sequence_video:", args.max_sequence_video)
+ print("max_sequence_chord:", args.max_sequence_chord)
+
+ print("n_layers:", args.n_layers)
+ print("num_heads:", args.num_heads)
+ print("d_model:", args.d_model)
+ print("")
+ print("dim_feedforward:", args.dim_feedforward)
+ print(SEPERATOR)
+ print("")
+
+# parse_generate_args
+def parse_generate_args():
+ parser = argparse.ArgumentParser()
+ outputpath = "./output_vevo/"+version
+ if IS_VIDEO:
+ modelpath = "./saved_models/AMT/best_loss_weights.pickle"
+ modelpathReg = "./saved_models/AMT/best_rmse_weights.pickle"
+ # modelpath = "./saved_models/"+version+ "/"+VIS_MODELS_PATH+"/results/best_acc_weights.pickle"
+ # modelpathReg = "./saved_models/"+version+ "/"+VIS_MODELS_PATH+"/results_regression_bigru/best_rmse_weights.pickle"
+ else:
+ modelpath = "./saved_models/"+version+ "/no_video/results/best_loss_weights.pickle"
+ modelpathReg = None
+
+ parser.add_argument("-dataset_dir", type=str, default="./dataset/", help="Folder of VEVO dataset")
+
+ parser.add_argument("-input_dir_music", type=str, default="./dataset/vevo_chord/" + MUSIC_TYPE, help="Folder of video CNN feature files")
+ parser.add_argument("-input_dir_video", type=str, default="./dataset/vevo_vis", help="Folder of video CNN feature files")
+
+ parser.add_argument("-output_dir", type=str, default= outputpath, help="Folder to write generated midi to")
+
+ parser.add_argument("-primer_file", type=str, default=None, help="File path or integer index to the evaluation dataset. Default is to select a random index.")
+ parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
+
+ parser.add_argument("-target_seq_length_midi", type=int, default=1024, help="Target length you'd like the midi to be")
+ parser.add_argument("-target_seq_length_chord", type=int, default=300, help="Target length you'd like the midi to be")
+
+ parser.add_argument("-num_prime_midi", type=int, default=256, help="Amount of messages to prime the generator with")
+ parser.add_argument("-num_prime_chord", type=int, default=30, help="Amount of messages to prime the generator with")
+ parser.add_argument("-model_weights", type=str, default=modelpath, help="Pickled model weights file saved with torch.save and model.state_dict()")
+ parser.add_argument("-modelReg_weights", type=str, default=modelpathReg, help="Pickled model weights file saved with torch.save and model.state_dict()")
+
+ parser.add_argument("-beam", type=int, default=0, help="Beam search k. 0 for random probability sample and 1 for greedy")
+
+ parser.add_argument("-max_sequence_midi", type=int, default=2048, help="Maximum midi sequence to consider")
+ parser.add_argument("-max_sequence_video", type=int, default=300, help="Maximum video sequence to consider")
+ parser.add_argument("-max_sequence_chord", type=int, default=300, help="Maximum chord sequence to consider")
+
+ parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
+ parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
+ parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
+ parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
+
+ parser.add_argument("-is_video", type=bool, default=IS_VIDEO, help="MusicTransformer or VideoMusicTransformer")
+
+ if IS_VIDEO:
+ parser.add_argument("-vis_models", type=str, default=VIS_MODELS_SORTED, help="...")
+ else:
+ parser.add_argument("-vis_models", type=str, default="", help="...")
+
+ parser.add_argument("-emo_model", type=str, default="6c_l14p", help="...")
+ parser.add_argument("-rpr", type=bool, default=RPR, help="...")
+ parser.add_argument("-test_id", type=str, default=None, help="Dimension of the feedforward layer")
+
+ return parser.parse_args()
+
+def print_generate_args(args):
+
+ print(SEPERATOR)
+ print("input_dir_music:", args.input_dir_music)
+ print("input_dir_video:", args.input_dir_video)
+
+ print("output_dir:", args.output_dir)
+ print("primer_file:", args.primer_file)
+ print("force_cpu:", args.force_cpu)
+ print("")
+
+ print("target_seq_length_midi:", args.target_seq_length_midi)
+ print("target_seq_length_chord:", args.target_seq_length_chord)
+
+ print("num_prime_midi:", args.num_prime_midi)
+ print("num_prime_chord:", args.num_prime_chord)
+
+ print("model_weights:", args.model_weights)
+ print("beam:", args.beam)
+ print("")
+ print("rpr:", args.rpr)
+
+ print("max_sequence_midi:", args.max_sequence_midi)
+ print("max_sequence_video:", args.max_sequence_video)
+ print("max_sequence_chord:", args.max_sequence_chord)
+
+ print("n_layers:", args.n_layers)
+ print("num_heads:", args.num_heads)
+ print("d_model:", args.d_model)
+ print("")
+ print("dim_feedforward:", args.dim_feedforward)
+ print("")
+ print("test_id:", args.test_id)
+
+ print(SEPERATOR)
+ print("")
+
+# write_model_params
+def write_model_params(args, output_file):
+ o_stream = open(output_file, "w")
+
+ o_stream.write("rpr: " + str(args.rpr) + "\n")
+ o_stream.write("lr: " + str(args.lr) + "\n")
+ o_stream.write("ce_smoothing: " + str(args.ce_smoothing) + "\n")
+ o_stream.write("batch_size: " + str(args.batch_size) + "\n")
+
+ o_stream.write("max_sequence_midi: " + str(args.max_sequence_midi) + "\n")
+ o_stream.write("max_sequence_video: " + str(args.max_sequence_video) + "\n")
+ o_stream.write("max_sequence_chord: " + str(args.max_sequence_chord) + "\n")
+
+ o_stream.write("n_layers: " + str(args.n_layers) + "\n")
+ o_stream.write("num_heads: " + str(args.num_heads) + "\n")
+ o_stream.write("d_model: " + str(args.d_model) + "\n")
+ o_stream.write("dim_feedforward: " + str(args.dim_feedforward) + "\n")
+ o_stream.write("dropout: " + str(args.dropout) + "\n")
+
+ o_stream.write("is_video: " + str(args.is_video) + "\n")
+ o_stream.write("vis_models: " + str(args.vis_models) + "\n")
+ o_stream.write("input_dir_music: " + str(args.input_dir_music) + "\n")
+ o_stream.write("input_dir_video: " + str(args.input_dir_video) + "\n")
+
+ o_stream.close()
diff --git a/utilities/chord_to_midi.py b/utilities/chord_to_midi.py
new file mode 100644
index 0000000000000000000000000000000000000000..393a43e039905a4d39f71d10371b796280713c9d
--- /dev/null
+++ b/utilities/chord_to_midi.py
@@ -0,0 +1,316 @@
+# ezchord - convert complex chord names to midi notes
+
+import sys
+import math
+import argparse
+from enum import Enum, auto
+from midiutil import MIDIFile
+
+class Mode(Enum):
+ DIM = auto()
+ MIN = auto()
+ MAJ = auto()
+ DOM = auto()
+ AUG = auto()
+ SUS2 = auto()
+ SUS = auto()
+ FIVE = auto()
+
+TEXT_TO_MODE = {
+ "maj": Mode.MAJ,
+ "dim": Mode.DIM,
+ "o": Mode.DIM,
+ "min": Mode.MIN,
+ "m": Mode.MIN,
+ "-": Mode.MIN,
+ "aug": Mode.AUG,
+ "+": Mode.AUG,
+ "sus2": Mode.SUS2,
+ "sus": Mode.SUS,
+ "5": Mode.FIVE,
+ "five": Mode.FIVE
+}
+
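+# Semitone shifts applied to the major-scale 3rd and 5th for each mode, e.g.
+# Mode.SUS raises the 3rd by one semitone onto the 4th, and Mode.FIVE raises
+# it by three onto the 5th (a power chord with no third).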
+MODE_TO_SHIFT = {
+ Mode.MAJ: {3:0, 5:0},
+ Mode.DOM: {3:0, 5:0},
+ Mode.DIM: {3:-1, 5:-1},
+ Mode.MIN: {3:-1, 5:0},
+ Mode.AUG: {3:0, 5:1},
+ Mode.SUS2: {3:-2, 5:0},
+ Mode.SUS: {3:1, 5:0},
+ Mode.FIVE: {3:3, 5:0},
+}
+
+NOTE_TO_PITCH = {
+ "a": 9,
+ "b": 11,
+ "c": 12,
+ "d": 14,
+ "e": 16,
+ "f": 17,
+ "g": 19
+}
+
+PITCH_TO_NOTE = {}
+
+for note, pitch in NOTE_TO_PITCH.items():
+ PITCH_TO_NOTE[pitch] = note
+
+RM_TO_PITCH = {
+ "vii": 11,
+ "iii": 4,
+ "vi": 9,
+ "iv": 5,
+ "ii": 2,
+ "i": 0,
+ "v": 7
+}
+
+ACC_TO_SHIFT = {
+ "b": -1,
+ "#": 1
+}
+
+SCALE_DEGREE_SHIFT = {
+ 1: 0,
+ 2: 2,
+ 3: 4,
+ 4: 5,
+ 5: 7,
+ 6: 9,
+ 7: 11
+}
+
+def getNumber(string):
+ numStr = ""
+
+ for char in string:
+ if char.isdigit():
+ numStr += char
+
+ if len(numStr) > 0:
+ return int(numStr)
+
+ return
+
+def textToPitch(text, key = "c", voice = True):
+    text = text.lower()
+    isLetter = text[0] in NOTE_TO_PITCH.keys()
+    pitch = 0  # fallback so an unparseable root degrades gracefully
+
+    if isLetter:
+        pitch = NOTE_TO_PITCH[text[0]]
+    else:
+        for rm in RM_TO_PITCH.keys():
+            if rm in text:
+                pitch = RM_TO_PITCH[rm] + textToPitch(key)
+                break
+
+    for i in range(1 if isLetter else 0, len(text)):
+        if text[i] in ACC_TO_SHIFT.keys():
+            pitch += ACC_TO_SHIFT[text[i]]
+
+    return pitch
+
+def pitchToText(pitch):
+ octave = math.floor(pitch / 12)
+ pitch = pitch % 12
+ pitch = pitch + (12 if pitch < 9 else 0)
+ accidental = ""
+
+ if not (pitch in PITCH_TO_NOTE.keys()):
+ pitch = (pitch + 1) % 12
+ pitch = pitch + (12 if pitch < 9 else 0)
+ accidental = "b"
+
+ return PITCH_TO_NOTE[pitch].upper() + accidental + str(octave)
+
+def degreeToShift(deg):
+ return SCALE_DEGREE_SHIFT[(deg - 1) % 7 + 1] + math.floor(deg / 8) * 12
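+# e.g. degreeToShift(9) == 14: one octave (12) plus a major second (2).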
+
+def voice(chords):
+ center = 0
+ voiced_chords = []
+ chord_ct = 0
+ pChord = None
+
+ for i, currChord in enumerate(chords):
+
+ if len(currChord) == 0:
+ voiced_chords.append( [] )
+ continue
+ else:
+ if chord_ct == 0:
+ voiced_chords.append( currChord )
+ chord_ct += 1
+ center = currChord[1] + 3
+ pChord = currChord
+ continue
+
+ prevChord = pChord
+ voiced_chord = []
+
+        for i_, currNote in enumerate(currChord):
+            # Bass note: shift by an octave only when it is far from the
+            # previous bass note; otherwise keep it as written.
+            if i_ == 0:
+                prevNote = prevChord[0]
+                bestVoicing = currNote
+                if abs(currNote - prevNote) > 7:
+                    if currNote < prevNote and abs(currNote + 12 - prevNote) < abs(currNote - prevNote):
+                        bestVoicing = currNote + 12
+                    elif currNote > prevNote and abs(currNote - 12 - prevNote) < abs(currNote - prevNote):
+                        bestVoicing = currNote - 12
+
+                voiced_chord.append(bestVoicing)
+                continue
+
+ bestNeighbor = None
+ allowance = -1
+
+            while bestNeighbor is None:
+ allowance += 1
+ for i__, prevNote in enumerate(prevChord):
+ if i__ == 0:
+ continue
+
+ if (
+ abs(currNote - prevNote) % 12 == allowance
+ or abs(currNote - prevNote) % 12 == 12 - allowance
+ ):
+ bestNeighbor = prevNote
+ break
+
+ if currNote <= bestNeighbor:
+ bestVoicing = currNote + math.floor((bestNeighbor - currNote + 6) / 12) * 12
+ else:
+ bestVoicing = currNote + math.ceil((bestNeighbor - currNote - 6) / 12) * 12
+
+ bestVoicing = bestVoicing if (abs(bestVoicing - center) <= 8 or allowance > 2) else currNote
+ voiced_chord.append(bestVoicing)
+
+
+ voiced_chord.sort()
+ voiced_chords.append(voiced_chord)
+ pChord = voiced_chord
+
+ return voiced_chords
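+# Usage sketch (illustrative): voice-lead a ii-V-I progression so adjacent
+# chords move by small intervals (Chord is defined below).
+#
+#   progression = [Chord(c).getMIDI() for c in ["Dm7", "G7", "Cmaj7"]]
+#   smooth = voice(progression)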
+
+class Chord:
+ def __init__(self, string):
+ self.string = string
+ self.degrees = {}
+
+ string += " "
+ self.split = []
+ sect = ""
+
+ notes = list(NOTE_TO_PITCH.keys())
+ rms = list(RM_TO_PITCH.keys())
+ accs = list(ACC_TO_SHIFT.keys())
+ modes = list(TEXT_TO_MODE.keys())
+
+ rootAdded = False
+ modeAdded = False
+
+ isRomanNumeral = False
+ isSlashChord = False
+ isMaj7 = False
+
+ for i in range(0, len(string) - 1):
+ sect += string[i]
+ currChar = string[i].lower()
+ nextChar = string[i+1].lower()
+
+            rootFound = not rootAdded and (currChar in notes + rms + accs and nextChar not in rms + accs)
+ modeFound = False
+ numFound = (currChar.isdigit() and not nextChar.isdigit())
+
+ if (
+ (i == len(string) - 2)
+ or rootFound
+ or numFound
+ or nextChar == "/"
+ or currChar == ")"
+ ):
+ if rootFound:
+ self.root = sect
+ rootAdded = True
+
+ isRomanNumeral = self.root in rms
+ elif sect[0] == "/":
+ # case for 6/9 chords
+ if sect[1] == "9":
+ self.degrees[9] = 0
+ else:
+ isSlashChord = True
+ self.bassnote = sect[1:len(sect)]
+ else:
+ if not modeAdded:
+ for mode in modes:
+ modeFound = mode in sect[0:len(mode)]
+ if modeFound:
+ self.mode = TEXT_TO_MODE[mode]
+ modeAdded = True
+ break
+
+ if not modeAdded:
+ if not isRomanNumeral and str(getNumber(sect)) == sect:
+ self.mode = Mode.DOM
+ modeFound = True
+ modeAdded = True
+
+ deg = getNumber(sect)
+                    if deg is not None:
+ shift = 0
+
+ for char in sect:
+ if char == "#":
+ shift += 1
+ elif char == "b":
+ shift -= 1
+
+ if (not modeFound) or deg % 2 == 0:
+ self.degrees[deg] = shift
+ elif deg >= 7:
+ for i in range(7, deg+1):
+ if i % 2 != 0:
+ self.degrees[i] = shift
+
+ self.split.append(sect)
+ sect = ""
+
+ if not modeAdded:
+ # Case for minor roman numeral chords
+ if self.root in rms and self.root == self.root.lower():
+ self.mode = Mode.MIN
+ else:
+ self.mode = Mode.DOM
+
+ if not isSlashChord:
+ self.bassnote = self.root
+
+ for sect in self.split:
+ isMaj7 = ("maj" in sect) or isMaj7
+
+ if (7 in self.degrees.keys()) and not isMaj7:
+ self.degrees[7] = -1
+
+ def getMIDI(self, key="c", octave=4):
+ notes = {}
+
+ notes[0] = textToPitch(self.bassnote, key) - 12
+
+ root = textToPitch(self.root, key)
+ notes[1] = root
+ notes[3] = root + degreeToShift(3) + MODE_TO_SHIFT[self.mode][3]
+ notes[5] = root + degreeToShift(5) + MODE_TO_SHIFT[self.mode][5]
+
+ for deg in self.degrees.keys():
+ notes[deg] = root + degreeToShift(deg) + self.degrees[deg]
+
+ for deg in notes.keys():
+ notes[deg] += 12 * octave
+
+ return list(notes.values())
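+
+# Usage sketch (illustrative): "Cmaj7" in the key of C, voiced around octave 4;
+# the bass is returned an octave below the chord tones.
+#
+#   Chord("Cmaj7").getMIDI("c", 4)   # -> [48, 60, 64, 67, 71] (C3, C4 E4 G4 B4)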
diff --git a/utilities/constants.py b/utilities/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c4a44d1f6aef4d591494a7bcbb014579f8014d7
--- /dev/null
+++ b/utilities/constants.py
@@ -0,0 +1,97 @@
+import torch
+from third_party.midi_processor.processor import RANGE_NOTE_ON, RANGE_NOTE_OFF, RANGE_VEL, RANGE_TIME_SHIFT
+
+#Proposed (AMT l0.4)
+# VERSION = "v27_video_rpr_nosep_l0.4"
+VERSION = "AMT"
+
+#Best Baseline (MT)
+# VERSION = "v27_novideo_rpr_nosep"
+
+IS_SEPERATED = False # True : separate chord-quality and root outputs
+RPR = True
+IS_VIDEO = True
+
+GEN_MODEL = "Video Music Transformer"
+# LSTM
+# Transformer
+# Music Transformer
+# Video Music Transformer
+
+LOSS_LAMBDA = 0.4 # lambda * chord + (1 - lambda) * emotion
+
+EMOTION_THRESHOLD = 0.80
+
+VIS_MODELS = "2d/clip_l14p"
+SPLIT_VER = "v1"
+
+MUSIC_TYPE = "lab_v2_norm"
+# - midi_prep
+# - lab
+# - lab_v2
+# - lab_v2_norm
+# ----------------------------------------- #
+
+VIS_ABBR_DIC = {
+ "2d/clip_l14p" : "clip_l14p", # NEW
+}
+
+vis_arr = VIS_MODELS.split(" ")
+vis_arr.sort()
+vis_abbr_path = ""
+for v in vis_arr:
+ vis_abbr_path = vis_abbr_path + "_" + VIS_ABBR_DIC[v]
+vis_abbr_path = vis_abbr_path[1:]
+
+VIS_MODELS_PATH = vis_abbr_path
+VIS_MODELS_SORTED = " ".join(vis_arr)
+
+# CHORD
+CHORD_END = 157
+CHORD_PAD = CHORD_END + 1
+CHORD_SIZE = CHORD_PAD + 1
+
+# CHORD_ROOT
+CHORD_ROOT_END = 13
+CHORD_ROOT_PAD = CHORD_ROOT_END + 1
+CHORD_ROOT_SIZE = CHORD_ROOT_PAD + 1
+
+# CHORD_ATTR
+CHORD_ATTR_END = 14
+CHORD_ATTR_PAD = CHORD_ATTR_END + 1
+CHORD_ATTR_SIZE = CHORD_ATTR_PAD + 1
+
+# SEMANTIC
+SEMANTIC_PAD = 0.0
+
+# SCENE_OFFSET
+SCENE_OFFSET_PAD = 0.0
+
+# MOTION
+MOTION_PAD = 0.0
+
+# EMOTION
+EMOTION_PAD = 0.0
+
+# NOTE_DENSITY
+NOTE_DENSITY_PAD = 0.0
+
+# LOUDNESS
+LOUDNESS_PAD = 0.0
+
+# OTHER
+SEPERATOR = "========================="
+ADAM_BETA_1 = 0.9
+ADAM_BETA_2 = 0.98
+ADAM_EPSILON = 10e-9
+LR_DEFAULT_START = 1.0
+SCHEDULER_WARMUP_STEPS = 4000
+TORCH_FLOAT = torch.float32
+TORCH_INT = torch.int32
+TORCH_LABEL_TYPE = torch.long
+PREPEND_ZEROS_WIDTH = 4
+
+# MIDI
+TOKEN_END = RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_VEL + RANGE_TIME_SHIFT
+TOKEN_PAD = TOKEN_END + 1
+VOCAB_SIZE = TOKEN_PAD + 1
diff --git a/utilities/device.py b/utilities/device.py
new file mode 100755
index 0000000000000000000000000000000000000000..61f0cf29ef9c1698842ef9ebcda48581fa165c34
--- /dev/null
+++ b/utilities/device.py
@@ -0,0 +1,67 @@
+# For all things related to devices
+#### ONLY USE PROVIDED FUNCTIONS, DO NOT USE GLOBAL CONSTANTS ####
+
+import torch
+
+TORCH_CPU_DEVICE = torch.device("cpu")
+
+if(torch.cuda.device_count() > 0):
+ TORCH_CUDA_DEVICE = torch.device("cuda:0")
+else:
+ print("----- WARNING: CUDA devices not detected. This will cause the model to run very slow! -----")
+ print("")
+ TORCH_CUDA_DEVICE = None
+
+USE_CUDA = False
+
+# use_cuda
+def use_cuda(cuda_bool):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Sets whether to use CUDA (if available), or use the CPU (not recommended)
+ ----------
+ """
+
+ global USE_CUDA
+ USE_CUDA = cuda_bool
+
+# get_device
+def get_device():
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Grabs the default device. Default device is CUDA if available and use_cuda is not False, CPU otherwise.
+ ----------
+ """
+
+ if((not USE_CUDA) or (TORCH_CUDA_DEVICE is None)):
+ return TORCH_CPU_DEVICE
+ else:
+ return TORCH_CUDA_DEVICE
+
+# cuda_device
+def cuda_device():
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Grabs the cuda device (may be None if CUDA is not available)
+ ----------
+ """
+
+ return TORCH_CUDA_DEVICE
+
+# cpu_device
+def cpu_device():
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Grabs the cpu device
+ ----------
+ """
+
+ return TORCH_CPU_DEVICE
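+
+# Usage sketch (illustrative): opt in to CUDA once at startup, then always
+# allocate through get_device() so CPU-only machines keep working.
+#
+#   use_cuda(True)
+#   x = torch.zeros(4, device=get_device())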
diff --git a/utilities/lr_scheduling.py b/utilities/lr_scheduling.py
new file mode 100644
index 0000000000000000000000000000000000000000..6620a03c8d06c7b4dd3b2467db5dfa2b1ac5b9a5
--- /dev/null
+++ b/utilities/lr_scheduling.py
@@ -0,0 +1,58 @@
+import math
+
+# LrStepTracker
+class LrStepTracker:
+ """
+ ----------
+ Author: Ryan Marshall
+ Modified: Damon Gwinn
+ ----------
+ Class for custom learn rate scheduler (to be used by torch.optim.lr_scheduler.LambdaLR).
+
+ Learn rate for each step (batch) given the warmup steps is:
+ lr = [ 1/sqrt(d_model) ] * min[ 1/sqrt(step) , step * (warmup_steps)^-1.5 ]
+
+ This is from Attention is All you Need (https://arxiv.org/abs/1706.03762)
+ ----------
+ """
+
+ def __init__(self, model_dim=512, warmup_steps=4000, init_steps=0):
+ # Store Values
+ self.warmup_steps = warmup_steps
+ self.model_dim = model_dim
+ self.init_steps = init_steps
+
+ # Begin Calculations
+ self.invsqrt_dim = (1 / math.sqrt(model_dim))
+ self.invsqrt_warmup = (1 / (warmup_steps * math.sqrt(warmup_steps)))
+
+ # step
+ def step(self, step):
+ """
+ ----------
+ Author: Ryan Marshall
+ Modified: Damon Gwinn
+ ----------
+ Method to pass to LambdaLR. Increments the step and computes the new learn rate.
+ ----------
+ """
+
+ step += self.init_steps
+ if(step <= self.warmup_steps):
+ return self.invsqrt_dim * self.invsqrt_warmup * step
+ else:
+ invsqrt_step = (1 / math.sqrt(step))
+ return self.invsqrt_dim * invsqrt_step
+
+# get_lr
+def get_lr(optimizer):
+ """
+ ----------
+ Author: Damon Gwinn
+ ----------
+ Hack to get the current learn rate of the model
+ ----------
+ """
+
+ for param_group in optimizer.param_groups:
+ return param_group['lr']
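+
+# Usage sketch (illustrative; constants from utilities/constants.py): wire the
+# tracker into LambdaLR so the base lr of LR_DEFAULT_START (1.0) is rescaled
+# every step.
+#
+#   opt = torch.optim.Adam(model.parameters(), lr=LR_DEFAULT_START,
+#                          betas=(ADAM_BETA_1, ADAM_BETA_2), eps=ADAM_EPSILON)
+#   tracker = LrStepTracker(model_dim=512, warmup_steps=SCHEDULER_WARMUP_STEPS)
+#   scheduler = torch.optim.lr_scheduler.LambdaLR(opt, tracker.step)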
diff --git a/utilities/preprocessing.py b/utilities/preprocessing.py
new file mode 100755
index 0000000000000000000000000000000000000000..e0c59e653f39aad11928d223e0087b4e33e78423
--- /dev/null
+++ b/utilities/preprocessing.py
@@ -0,0 +1,39 @@
+import torch as th
+
+class Normalize(object):
+
+ def __init__(self, mean, std):
+ self.mean = th.FloatTensor(mean).view(1, 3, 1, 1)
+ self.std = th.FloatTensor(std).view(1, 3, 1, 1)
+
+ def __call__(self, tensor):
+ tensor = (tensor - self.mean) / (self.std + 1e-8)
+ return tensor
+
+class Preprocessing(object):
+
+ def __init__(self, type):
+ self.type = type
+ if type == '2d':
+ self.norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ elif type == '3d':
+ self.norm = Normalize(mean=[110.6, 103.2, 96.3], std=[1.0, 1.0, 1.0])
+
+ def _zero_pad(self, tensor, size):
+ n = size - len(tensor) % size
+ if n == size:
+ return tensor
+ else:
+ z = th.zeros(n, tensor.shape[1], tensor.shape[2], tensor.shape[3])
+ return th.cat((tensor, z), 0)
+
+ def __call__(self, tensor):
+ if self.type == '2d':
+ tensor = tensor / 255.0
+ tensor = self.norm(tensor)
+ elif self.type == '3d':
+ tensor = self._zero_pad(tensor, 16)
+ tensor = self.norm(tensor)
+ tensor = tensor.view(-1, 16, 3, 112, 112)
+ tensor = tensor.transpose(1, 2)
+ return tensor
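+
+# Usage sketch (illustrative): normalize a stack of RGB frames for the '2d'
+# pipeline; input is (n_frames, 3, H, W) with values in [0, 255].
+#
+#   preprocess = Preprocessing('2d')
+#   frames = th.randint(0, 256, (16, 3, 224, 224)).float()
+#   frames = preprocess(frames)   # scaled to [0, 1], then mean/std normalized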
diff --git a/utilities/run_model_regression.py b/utilities/run_model_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6eb1deaf004390dc32a2d80167df032166a9f5d
--- /dev/null
+++ b/utilities/run_model_regression.py
@@ -0,0 +1,120 @@
+import torch
+import time
+
+from .constants import *
+from utilities.device import get_device
+from .lr_scheduling import get_lr
+import torch.nn.functional as F
+
+def train_epoch(cur_epoch, model, dataloader, loss, opt, lr_scheduler=None, print_modulus=1):
+ out = -1
+ model.train()
+ for batch_num, batch in enumerate(dataloader):
+ time_before = time.time()
+ opt.zero_grad()
+
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+
+ feature_note_density = batch["note_density"].to(get_device())
+ feature_loudness = batch["loudness"].to(get_device())
+
+ y = model(
+ feature_semantic_list,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ feature_loudness = feature_loudness.flatten().reshape(-1,1) # (300, 1)
+ feature_note_density = feature_note_density.flatten().reshape(-1,1) # (300, 1)
+ feature_combined = torch.cat((feature_note_density, feature_loudness), dim=1) # (300, 2)
+
+ out = loss.forward(y, feature_combined)
+ out.backward()
+ opt.step()
+
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+ time_after = time.time()
+ time_took = time_after - time_before
+
+ if((batch_num+1) % print_modulus == 0):
+ print(SEPERATOR)
+ print("Epoch", cur_epoch, " Batch", batch_num+1, "/", len(dataloader))
+ print("LR:", get_lr(opt))
+ print("Train loss:", float(out))
+ print("")
+ print("Time (s):", time_took)
+ print(SEPERATOR)
+ print("")
+ return
+
+def eval_model(model, dataloader, loss):
+ model.eval()
+
+ avg_rmse = -1
+ avg_loss = -1
+ avg_rmse_note_density = -1
+ avg_rmse_loudness = -1
+ with torch.set_grad_enabled(False):
+ n_test = len(dataloader)
+
+ sum_loss = 0.0
+
+ sum_rmse = 0.0
+ sum_rmse_note_density = 0.0
+ sum_rmse_loudness = 0.0
+
+ for batch in dataloader:
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+ feature_loudness = batch["loudness"].to(get_device())
+ feature_note_density = batch["note_density"].to(get_device())
+
+ y = model(
+ feature_semantic_list,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ feature_loudness = feature_loudness.flatten().reshape(-1,1) # (300, 1)
+ feature_note_density = feature_note_density.flatten().reshape(-1,1) # (300, 1)
+ feature_combined = torch.cat((feature_note_density, feature_loudness), dim=1) # (300, 2)
+
+ mse = F.mse_loss(y, feature_combined)
+ rmse = torch.sqrt(mse)
+ sum_rmse += float(rmse)
+
+ y_note_density, y_loudness = torch.split(y, split_size_or_sections=1, dim=1)
+
+ mse_note_density = F.mse_loss(y_note_density, feature_note_density)
+ rmse_note_density = torch.sqrt(mse_note_density)
+ sum_rmse_note_density += float(rmse_note_density)
+
+ mse_loudness = F.mse_loss(y_loudness, feature_loudness)
+ rmse_loudness = torch.sqrt(mse_loudness)
+ sum_rmse_loudness += float(rmse_loudness)
+
+ out = loss.forward(y, feature_combined)
+ sum_loss += float(out)
+
+ avg_loss = sum_loss / n_test
+ avg_rmse = sum_rmse / n_test
+ avg_rmse_note_density = sum_rmse_note_density / n_test
+ avg_rmse_loudness = sum_rmse_loudness / n_test
+
+ return avg_loss, avg_rmse, avg_rmse_note_density, avg_rmse_loudness
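+
+# Note: RMSE is averaged per batch rather than computed once over the whole
+# split; with equal batch sizes the two agree closely but not exactly.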
diff --git a/utilities/run_model_vevo.py b/utilities/run_model_vevo.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b8349d4e25355e75b8543699754fe86752f050f
--- /dev/null
+++ b/utilities/run_model_vevo.py
@@ -0,0 +1,525 @@
+import torch
+import time
+
+from .constants import *
+from utilities.device import get_device
+from .lr_scheduling import get_lr
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+import json
+
+from dataset.vevo_dataset import compute_vevo_accuracy, compute_vevo_correspondence, compute_hits_k, compute_hits_k_root_attr, compute_vevo_accuracy_root_attr, compute_vevo_correspondence_root_attr
+
+def train_epoch(cur_epoch, model, dataloader,
+ train_loss_func, train_loss_emotion_func,
+ opt, lr_scheduler=None, print_modulus=1, isVideo=True):
+
+ loss_chord = -1
+ loss_emotion = -1
+ model.train()
+ for batch_num, batch in enumerate(dataloader):
+ time_before = time.time()
+ opt.zero_grad()
+
+ x = batch["x"].to(get_device())
+ tgt = batch["tgt"].to(get_device())
+ x_root = batch["x_root"].to(get_device())
+ tgt_root = batch["tgt_root"].to(get_device())
+ x_attr = batch["x_attr"].to(get_device())
+ tgt_attr = batch["tgt_attr"].to(get_device())
+ tgt_emotion = batch["tgt_emotion"].to(get_device())
+ tgt_emotion_prob = batch["tgt_emotion_prob"].to(get_device())
+
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_key = batch["key"].to(get_device())
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+
+ if isVideo:
+ # use VideoMusicTransformer
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = train_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = train_loss_func.forward(y_attr, tgt_attr)
+ loss_chord = loss_chord_root + loss_chord_attr
+
+ first_14 = tgt_emotion[:, :14]
+ last_2 = tgt_emotion[:, -2:]
+ tgt_emotion_attr = torch.cat((first_14, last_2), dim=1)
+
+ loss_emotion = train_loss_emotion_func.forward(y_attr, tgt_emotion_attr)
+
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+ total_loss.backward()
+ opt.step()
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+
+ else:
+ #videomusic tran nosep
+ y = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+ tgt = tgt.flatten()
+ tgt_emotion = tgt_emotion.squeeze()
+ loss_chord = train_loss_func.forward(y, tgt)
+ loss_emotion = train_loss_emotion_func.forward(y, tgt_emotion)
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+ total_loss.backward()
+ opt.step()
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+
+ else:
+ # music transformer
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = train_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = train_loss_func.forward(y_attr, tgt_attr)
+
+ loss_chord = loss_chord_root + loss_chord_attr
+ loss_emotion = -1
+
+ total_loss = loss_chord
+ total_loss.backward()
+ opt.step()
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+ else:
+ # use MusicTransformer (no sep)
+ y = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+ tgt = tgt.flatten()
+
+ loss_chord = train_loss_func.forward(y, tgt)
+ loss_emotion = -1
+
+ total_loss = loss_chord
+ total_loss.backward()
+
+ opt.step()
+
+ if(lr_scheduler is not None):
+ lr_scheduler.step()
+
+ time_after = time.time()
+ time_took = time_after - time_before
+
+ if((batch_num+1) % print_modulus == 0):
+ print(SEPERATOR)
+ print("Epoch", cur_epoch, " Batch", batch_num+1, "/", len(dataloader))
+ print("LR:", get_lr(opt))
+ print("Train loss (total):", float(total_loss))
+ print("Train loss (chord):", float(loss_chord))
+ print("Train loss (emotion):", float(loss_emotion))
+ print("")
+ print("Time (s):", time_took)
+ print(SEPERATOR)
+ print("")
+ return
+
+def eval_model(model, dataloader,
+ eval_loss_func, eval_loss_emotion_func,
+ isVideo = True, isGenConfusionMatrix=False):
+ model.eval()
+ avg_acc = -1
+ avg_cor = -1
+ avg_acc_cor = -1
+
+ avg_h1 = -1
+ avg_h3 = -1
+ avg_h5 = -1
+
+ avg_loss_chord = -1
+ avg_loss_emotion = -1
+ avg_total_loss = -1
+
+ true_labels = []
+ true_root_labels = []
+ true_attr_labels = []
+
+ pred_labels = []
+ pred_root_labels = []
+ pred_attr_labels = []
+
+ with torch.set_grad_enabled(False):
+ n_test = len(dataloader)
+ n_test_cor = 0
+
+ sum_loss_chord = 0.0
+ sum_loss_emotion = 0.0
+ sum_total_loss = 0.0
+
+ sum_acc = 0.0
+ sum_cor = 0.0
+
+ sum_h1 = 0.0
+ sum_h3 = 0.0
+ sum_h5 = 0.0
+
+ for batch in dataloader:
+ x = batch["x"].to(get_device())
+ tgt = batch["tgt"].to(get_device())
+ x_root = batch["x_root"].to(get_device())
+ tgt_root = batch["tgt_root"].to(get_device())
+ x_attr = batch["x_attr"].to(get_device())
+ tgt_attr = batch["tgt_attr"].to(get_device())
+ tgt_emotion = batch["tgt_emotion"].to(get_device())
+ tgt_emotion_prob = batch["tgt_emotion_prob"].to(get_device())
+
+ feature_semantic_list = []
+ for feature_semantic in batch["semanticList"]:
+ feature_semantic_list.append( feature_semantic.to(get_device()) )
+
+ feature_key = batch["key"].to(get_device())
+ feature_scene_offset = batch["scene_offset"].to(get_device())
+ feature_motion = batch["motion"].to(get_device())
+ feature_emotion = batch["emotion"].to(get_device())
+
+ if isVideo:
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ sum_acc += float(compute_vevo_accuracy_root_attr(y_root, y_attr, tgt))
+ cor = float(compute_vevo_correspondence_root_attr(y_root, y_attr, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,1))
+ sum_h3 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,3))
+ sum_h5 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,5))
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = eval_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = eval_loss_func.forward(y_attr, tgt_attr)
+ loss_chord = loss_chord_root + loss_chord_attr
+
+ first_14 = tgt_emotion[:, :14]
+ last_2 = tgt_emotion[:, -2:]
+ tgt_emotion_attr = torch.cat((first_14, last_2), dim=1)
+
+ loss_emotion = eval_loss_emotion_func.forward(y_attr, tgt_emotion_attr)
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+ else:
+                    y = model(x,
+ x_root,
+ x_attr,
+ feature_semantic_list,
+ feature_key,
+ feature_scene_offset,
+ feature_motion,
+ feature_emotion)
+
+ sum_acc += float(compute_vevo_accuracy(y, tgt ))
+ cor = float(compute_vevo_correspondence(y, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k(y, tgt,1))
+ sum_h3 += float(compute_hits_k(y, tgt,3))
+ sum_h5 += float(compute_hits_k(y, tgt,5))
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+
+ tgt = tgt.flatten()
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord = eval_loss_func.forward(y, tgt)
+ loss_emotion = eval_loss_emotion_func.forward(y, tgt_emotion)
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+
+ if isGenConfusionMatrix:
+ pred = y.argmax(dim=1).detach().cpu().numpy()
+ pred_root = []
+ pred_attr = []
+
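+                        # Chord IDs 1..156 appear to pack root and quality as
+                        # (root-1)*13 + attr, with 0 as the no-chord token and
+                        # 157/158 as END/PAD (see utilities/constants.py).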
+ for i in pred:
+ if i == 0:
+ pred_root.append(0)
+ pred_attr.append(0)
+ elif i == 157:
+ pred_root.append(CHORD_ROOT_END)
+ pred_attr.append(CHORD_ATTR_END)
+ elif i == 158:
+ pred_root.append(CHORD_ROOT_PAD)
+ pred_attr.append(CHORD_ATTR_PAD)
+ else:
+ rootindex = int( (i-1)/13 ) + 1
+ attrindex = (i-1)%13 + 1
+ pred_root.append(rootindex)
+ pred_attr.append(attrindex)
+
+ pred_root = np.array(pred_root)
+ pred_attr = np.array(pred_attr)
+
+ true = tgt.detach().cpu().numpy()
+ true_root = tgt_root.detach().cpu().numpy()
+ true_attr = tgt_attr.detach().cpu().numpy()
+
+ pred_labels.extend(pred)
+ pred_root_labels.extend(pred_root)
+ pred_attr_labels.extend(pred_attr)
+
+ true_labels.extend(true)
+ true_root_labels.extend(true_root)
+ true_attr_labels.extend(true_attr)
+ else:
+ if IS_SEPERATED:
+ y_root, y_attr = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ sum_acc += float(compute_vevo_accuracy_root_attr(y_root, y_attr, tgt))
+ cor = float(compute_vevo_correspondence_root_attr(y_root, y_attr, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,1))
+ sum_h3 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,3))
+ sum_h5 += float(compute_hits_k_root_attr(y_root, y_attr, tgt,5))
+
+ y_root = y_root.reshape(y_root.shape[0] * y_root.shape[1], -1)
+ y_attr = y_attr.reshape(y_attr.shape[0] * y_attr.shape[1], -1)
+
+ tgt_root = tgt_root.flatten()
+ tgt_attr = tgt_attr.flatten()
+ tgt_emotion = tgt_emotion.squeeze()
+
+ loss_chord_root = eval_loss_func.forward(y_root, tgt_root)
+ loss_chord_attr = eval_loss_func.forward(y_attr, tgt_attr)
+ loss_chord = loss_chord_root + loss_chord_attr
+
+ first_14 = tgt_emotion[:, :14]
+ last_2 = tgt_emotion[:, -2:]
+ tgt_emotion_attr = torch.cat((first_14, last_2), dim=1)
+ loss_emotion = eval_loss_emotion_func.forward(y_attr, tgt_emotion_attr)
+
+ total_loss = LOSS_LAMBDA * loss_chord + (1-LOSS_LAMBDA) * loss_emotion
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+ else:
+ # use MusicTransformer no sep
+ y = model(x,
+ x_root,
+ x_attr,
+ feature_key)
+
+ sum_acc += float(compute_vevo_accuracy(y, tgt ))
+ cor = float(compute_vevo_correspondence(y, tgt, tgt_emotion, tgt_emotion_prob, EMOTION_THRESHOLD))
+
+ if cor >= 0 :
+ n_test_cor +=1
+ sum_cor += cor
+
+ sum_h1 += float(compute_hits_k(y, tgt,1))
+ sum_h3 += float(compute_hits_k(y, tgt,3))
+ sum_h5 += float(compute_hits_k(y, tgt,5))
+
+ tgt_emotion = tgt_emotion.squeeze()
+
+ y = y.reshape(y.shape[0] * y.shape[1], -1)
+ tgt = tgt.flatten()
+ loss_chord = eval_loss_func.forward(y, tgt)
+ loss_emotion = eval_loss_emotion_func.forward(y, tgt_emotion)
+ total_loss = loss_chord
+
+ sum_loss_chord += float(loss_chord)
+ sum_loss_emotion += float(loss_emotion)
+ sum_total_loss += float(total_loss)
+
+ avg_loss_chord = sum_loss_chord / n_test
+ avg_loss_emotion = sum_loss_emotion / n_test
+ avg_total_loss = sum_total_loss / n_test
+
+ avg_acc = sum_acc / n_test
+        avg_cor = sum_cor / n_test_cor if n_test_cor > 0 else -1  # guard: no qualifying batches
+
+ avg_h1 = sum_h1 / n_test
+ avg_h3 = sum_h3 / n_test
+ avg_h5 = sum_h5 / n_test
+
+ avg_acc_cor = (avg_acc + avg_cor)/ 2.0
+
+ if isGenConfusionMatrix:
+ chordInvDicPath = "./dataset/vevo_meta/chord_inv.json"
+ chordRootInvDicPath = "./dataset/vevo_meta/chord_root_inv.json"
+ chordAttrInvDicPath = "./dataset/vevo_meta/chord_attr_inv.json"
+
+ with open(chordInvDicPath) as json_file:
+ chordInvDic = json.load(json_file)
+ with open(chordRootInvDicPath) as json_file:
+ chordRootInvDic = json.load(json_file)
+ with open(chordAttrInvDicPath) as json_file:
+ chordAttrInvDic = json.load(json_file)
+
+ # Confusion matrix (CHORD)
+ topChordList = []
+ with open("./dataset/vevo_meta/top_chord.txt", encoding = 'utf-8') as f:
+ for line in f:
+ line = line.strip()
+ line_arr = line.split(" ")
+ if len(line_arr) == 3 :
+ chordID = line_arr[1]
+ topChordList.append( int(chordID) )
+ topChordList = np.array(topChordList)
+ topChordList = topChordList[:10]
+ mask = np.isin(true_labels, topChordList)
+ true_labels = np.array(true_labels)[mask]
+ pred_labels = np.array(pred_labels)[mask]
+
+ conf_matrix = confusion_matrix(true_labels, pred_labels, labels=topChordList)
+ label_names = [ chordInvDic[str(label_id)] for label_id in topChordList ]
+
+ plt.figure(figsize=(8, 6))
+ plt.imshow(conf_matrix, cmap=plt.cm.Blues)
+ plt.title("Confusion Matrix")
+ plt.colorbar()
+ tick_marks = np.arange(len(topChordList))
+ plt.xticks(tick_marks, label_names, rotation=45)
+ plt.yticks(tick_marks, label_names)
+ thresh = conf_matrix.max() / 2.0
+ for i in range(conf_matrix.shape[0]):
+ for j in range(conf_matrix.shape[1]):
+ plt.text(j, i, format(conf_matrix[i, j], 'd'),
+ ha="center", va="center",
+ color="white" if conf_matrix[i, j] > thresh else "black")
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.tight_layout()
+ plt.savefig("confusion_matrix.png")
+ plt.show()
+
+ # Confusion matrix (CHORD ROOT)
+ chordRootList = np.arange(1, 13)
+ conf_matrix = confusion_matrix(true_root_labels, pred_root_labels, labels= chordRootList )
+
+ label_names = [ chordRootInvDic[str(label_id)] for label_id in chordRootList ]
+
+ plt.figure(figsize=(8, 6))
+ plt.imshow(conf_matrix, cmap=plt.cm.Blues)
+ plt.title("Confusion Matrix (Chord root)")
+ plt.colorbar()
+ tick_marks = np.arange(len(chordRootList))
+ plt.xticks(tick_marks, label_names, rotation=45)
+ plt.yticks(tick_marks, label_names)
+ thresh = conf_matrix.max() / 2.0
+ for i in range(conf_matrix.shape[0]):
+ for j in range(conf_matrix.shape[1]):
+ plt.text(j, i, format(conf_matrix[i, j], 'd'),
+ ha="center", va="center",
+ color="white" if conf_matrix[i, j] > thresh else "black")
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.tight_layout()
+ plt.savefig("confusion_matrix_root.png")
+ plt.show()
+
+ # Confusion matrix (CHORD ATTR)
+ chordAttrList = np.arange(1, 14)
+ conf_matrix = confusion_matrix(true_attr_labels, pred_attr_labels, labels= chordAttrList )
+
+ label_names = [ chordAttrInvDic[str(label_id)] for label_id in chordAttrList ]
+
+ plt.figure(figsize=(8, 6))
+ plt.imshow(conf_matrix, cmap=plt.cm.Blues)
+ plt.title("Confusion Matrix (Chord quality)")
+ plt.colorbar()
+ tick_marks = np.arange(len(chordAttrList))
+ plt.xticks(tick_marks, label_names, rotation=45)
+ plt.yticks(tick_marks, label_names)
+ thresh = conf_matrix.max() / 2.0
+ for i in range(conf_matrix.shape[0]):
+ for j in range(conf_matrix.shape[1]):
+ plt.text(j, i, format(conf_matrix[i, j], 'd'),
+ ha="center", va="center",
+ color="white" if conf_matrix[i, j] > thresh else "black")
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+ plt.tight_layout()
+ plt.savefig("confusion_matrix_quality.png")
+ plt.show()
+
+ return { "avg_total_loss" : avg_total_loss,
+ "avg_loss_chord" : avg_loss_chord,
+ "avg_loss_emotion": avg_loss_emotion,
+ "avg_acc" : avg_acc,
+ "avg_cor" : avg_cor,
+ "avg_acc_cor" : avg_acc_cor,
+ "avg_h1" : avg_h1,
+ "avg_h3" : avg_h3,
+ "avg_h5" : avg_h5 }
+
diff --git a/utilities/video_loader.py b/utilities/video_loader.py
new file mode 100755
index 0000000000000000000000000000000000000000..9261585228a28c563ba974fa49ce02da2a548572
--- /dev/null
+++ b/utilities/video_loader.py
@@ -0,0 +1,83 @@
+import torch as th
+from torch.utils.data import Dataset
+import pandas as pd
+import os
+import numpy as np
+import ffmpeg
+
+class VideoLoader(Dataset):
+ def __init__(
+ self,
+ fileList = [],
+ framerate=1,
+ size=112,
+ centercrop=False,
+ ):
+ #self.csv = pd.read_csv(csv)
+ self.fileList = fileList
+
+ self.centercrop = centercrop
+ self.size = size
+ self.framerate = framerate
+
+ def __len__(self):
+ return len(self.fileList)
+
+ def _get_video_dim(self, video_path):
+ probe = ffmpeg.probe(video_path)
+ video_stream = next((stream for stream in probe['streams']
+ if stream['codec_type'] == 'video'), None)
+ width = int(video_stream['width'])
+ height = int(video_stream['height'])
+ return height, width
+
+ def _get_output_dim(self, h, w):
+ if isinstance(self.size, tuple) and len(self.size) == 2:
+ return self.size
+ elif h >= w:
+ return int(h * self.size / w), self.size
+ else:
+ return self.size, int(w * self.size / h)
+
+ def __getitem__(self, idx):
+
+ video_path = self.fileList[idx]
+ output_file = video_path[:video_path.rfind(".")] + ".npy"
+
+ #video_path = self.csv['video_path'].values[idx]
+ #output_file = self.csv['feature_path'].values[idx]
+
+ if not(os.path.isfile(output_file)) and os.path.isfile(video_path):
+ print('Decoding video: {}'.format(video_path))
+
+
+ try:
+ h, w = self._get_video_dim(video_path)
+            except Exception:
+ print('ffprobe failed at: {}'.format(video_path))
+ return {'video': th.zeros(1), 'input': video_path,
+ 'output': output_file}
+ height, width = self._get_output_dim(h, w)
+ cmd = (
+ ffmpeg
+ .input(video_path)
+ .filter('fps', fps=self.framerate)
+ .filter('scale', width, height)
+ )
+ if self.centercrop:
+ x = int((width - self.size) / 2.0)
+ y = int((height - self.size) / 2.0)
+ cmd = cmd.crop(x, y, self.size, self.size)
+ out, _ = (
+ cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24')
+ .run(capture_stdout=True, quiet=True)
+ )
+ if self.centercrop and isinstance(self.size, int):
+ height, width = self.size, self.size
+ video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
+ video = th.from_numpy(video.astype('float32'))
+ video = video.permute(0, 3, 1, 2)
+ else:
+ video = th.zeros(1)
+
+ return {'video': video, 'input': video_path, 'output': output_file}
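+
+# Usage sketch (illustrative; "input.mp4" is a placeholder path): decode a
+# video at 1 fps into center-cropped frames ready for feature extraction.
+#
+#   loader = VideoLoader(fileList=["input.mp4"], framerate=1, size=224, centercrop=True)
+#   sample = loader[0]   # {'video': (n, 3, 224, 224) tensor, 'input': ..., 'output': ...}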